# Loading the libraries
library(tidyverse)
library(lubridate)
library(stringr)
library(pROC)
library(rpart)
library(ROCR)
library(C50)
library(caret)
library(ranger)
# Read the Lending Club sample into the lcdf tibble
lcdf <- read_csv(
  "/Users/darshan/Documents/IDS 572 Data Mining/Assignment 1/lcDataSample.csv"
)
# Report the dimensions of the loaded data frame
paste0("The number of rows are = ", nrow(lcdf))
## [1] "The number of rows are = 110000"
paste0("The number of columns are = ", ncol(lcdf))
## [1] "The number of columns are = 145"
# Number of loans per loan_status value
lcdf %>%
  count(loan_status)
## # A tibble: 6 × 2
## loan_status n
## <chr> <int>
## 1 Charged Off 15377
## 2 Current 17
## 3 Fully Paid 94567
## 4 In Grace Period 2
## 5 Late (16-30 days) 1
## 6 Late (31-120 days) 36
paste0("Since there are values apart from the target - fullly paid and charged off we will keep only fully paid and charged off loans from the target variable.
#Filtering the dataframe and updating it to the same dataframe")
## [1] "Since there are values apart from the target - fullly paid and charged off we will keep only fully paid and charged off loans from the target variable.\n#Filtering the dataframe and updating it to the same dataframe"
# loan_status holds values other than the two target classes, so keep only
# the "Fully Paid" and "Charged Off" rows and overwrite lcdf with the result.
lcdf <- lcdf %>%
  filter(loan_status %in% c("Fully Paid", "Charged Off"))
# Re-count the statuses to confirm that only the two target classes remain
lcdf %>%
  count(loan_status)
## # A tibble: 2 × 2
## loan_status n
## <chr> <int>
## 1 Charged Off 15377
## 2 Fully Paid 94567
# Pie chart of the share of loans in each loan status
loan_status_count <- lcdf %>%
  group_by(loan_status) %>%
  count()
# Whole-number percentage of loans per status
pct <- round(loan_status_count$n / sum(loan_status_count$n) * 100)
# Slice labels of the form "<status> <pct>%"
lbls <- paste0(loan_status_count$loan_status, " ", pct, "%")
pie(
  loan_status_count$n,
  labels = lbls,
  main = "Percentage of Loans with Loan Status"
)
We will create a box plot to visualize the spread of the interest rate
# Numeric summary of the interest rate
summary(lcdf[["int_rate"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.32 8.90 11.99 12.05 14.48 28.99
# Box plot of the interest rate across all loans
ggplot(lcdf, aes(x = int_rate)) +
  geom_boxplot() +
  labs(x = "Interest Rate ") +
  theme(
    plot.title = element_text(color = "#993333", size = 14, face = "bold.italic"),
    axis.title.x = element_text(color = "#993333", size = 14, face = "bold"),
    axis.title.y = element_text(color = "#993333", size = 14, face = "bold")
  )
25 percent of loans carry an interest rate below 8.9%. The median interest rate across all loans is 11.99%. The interest rate can go as high as 28.99% in some cases. A higher interest rate can indicate a higher-risk loan. These rates look really attractive to invest in: very few investment products pay an interest of 12%.
# Bar chart: number of loans in each home-ownership category
ggplot(lcdf, aes(x = home_ownership)) +
  geom_bar(colour = "black", fill = "white") +
  labs(
    title = "Number of Loans By Homeownerships",
    x = "Different Types of Homeownership",
    y = "Number of Loans "
  ) +
  theme(
    plot.title = element_text(color = "#993333", size = 14, face = "bold.italic"),
    axis.title.x = element_text(color = "#993333", size = 14, face = "bold"),
    axis.title.y = element_text(color = "#993333", size = 14, face = "bold")
  )
Most borrowers do not own their home outright. Most of the loans were given to people who have a mortgage or live in a rented house.
Loans also have different grade and we would want to see how many of them are present in each grade along with loan status
# Number of loans in each grade
lcdf %>%
  count(grade)
## # A tibble: 7 × 2
## grade n
## <chr> <int>
## 1 A 24854
## 2 B 37865
## 3 C 29145
## 4 D 13455
## 5 E 3790
## 6 F 753
## 7 G 82
Adding the loan status to check on loan status and grade together
# Cross-tabulation of loan status (rows) against grade (columns)
table(lcdf[["loan_status"]], lcdf[["grade"]])
##
## A B C D E F G
## Charged Off 1369 4264 5206 3165 1090 252 31
## Fully Paid 23485 33601 23939 10290 2700 501 51
Some loans in grade ‘A’ have been charged off, and some loans in grade ‘G’ have been fully paid. Let us look at the default percentage of each grade to get a better picture.
# Per grade: total loans, fully paid and charged-off counts, and the share of
# charged-off loans expressed as a percentage
lcdf %>%
  group_by(grade) %>%
  summarise(
    TotalLoans = n(),
    FullyPaid = sum(loan_status == "Fully Paid"),
    ChargedOff = sum(loan_status == "Charged Off"),
    default_percentage = ChargedOff / TotalLoans * 100
  )
## # A tibble: 7 × 5
## grade TotalLoans FullyPaid ChargedOff default_percentage
## <chr> <int> <int> <int> <dbl>
## 1 A 24854 23485 1369 5.51
## 2 B 37865 33601 4264 11.3
## 3 C 29145 23939 5206 17.9
## 4 D 13455 10290 3165 23.5
## 5 E 3790 2700 1090 28.8
## 6 F 753 501 252 33.5
## 7 G 82 51 31 37.8
# Per-grade overview: loan count, total and mean loan amount, defaults and
# default rate, interest-rate statistics, and total/mean payments received
lcdf %>%
  group_by(grade) %>%
  summarise(
    numberOfLoans = n(),
    TotLoanAmt = sum(loan_amnt),
    MeanLoanAmt = mean(loan_amnt),
    defaults = sum(loan_status == "Charged Off"),
    defaultRate = defaults / numberOfLoans,
    default_percentage = defaultRate * 100,
    MeanIntRate = mean(int_rate),
    stdInterest = sd(int_rate),
    minInt = min(int_rate),
    maxInt = max(int_rate),
    avgLoanAMt = mean(loan_amnt),
    sumPmnt = sum(total_pymnt),
    avgPmnt = mean(total_pymnt)
  )
## # A tibble: 7 × 14
## grade numberOfLoans TotLoanAmt MeanLoanAmt defaults defaultRate
## <chr> <int> <dbl> <dbl> <int> <dbl>
## 1 A 24854 356633075 14349. 1369 0.0551
## 2 B 37865 473544750 12506. 4264 0.113
## 3 C 29145 351136175 12048. 5206 0.179
## 4 D 13455 160060000 11896. 3165 0.235
## 5 E 3790 45192200 11924. 1090 0.288
## 6 F 753 7104350 9435. 252 0.335
## 7 G 82 947125 11550. 31 0.378
## # … with 8 more variables: default_percentage <dbl>, MeanIntRate <dbl>,
## # stdInterest <dbl>, minInt <dbl>, maxInt <dbl>, avgLoanAMt <dbl>,
## # sumPmnt <dbl>, avgPmnt <dbl>
# Title/axis-title styling shared by every plot in this section
loan_plot_theme <- theme(
  plot.title = element_text(color = "#993333", size = 14, face = "bold.italic"),
  axis.title.x = element_text(color = "#993333", size = 14, face = "bold"),
  axis.title.y = element_text(color = "#993333", size = 14, face = "bold")
)

# Loan Amount Distribution
# The histogram is scaled to density so the density curve shares its axis.
# after_stat(density) replaces the deprecated ..density.. notation, and the
# y axis is labelled as a density rather than a count.
ggplot(lcdf, aes(x = loan_amnt)) +
  geom_histogram(
    aes(y = after_stat(density)),
    colour = "black", fill = "white", bins = 15
  ) +
  geom_density(alpha = .2, fill = "#FF6666") +
  ggtitle("Distribution of Loan Amount Changing Bins ") +
  xlab("Loan Amount ") +
  ylab("Density") +
  loan_plot_theme

# Loan Amount Distribution by Grade (stacked counts, coloured by grade)
ggplot(lcdf, aes(x = loan_amnt)) +
  geom_histogram(aes(fill = grade)) +
  ggtitle("Distribution of Loan Amount With Grade") +
  xlab("Loan Amount ") +
  ylab("Number of Loans ") +
  loan_plot_theme

# Let us look at the distribution: one loan-amount box per grade
ggplot(lcdf, aes(x = loan_amnt)) +
  geom_boxplot(aes(fill = grade)) +
  xlab("Loan Amount ") +
  ylab("Grades of Each Loan ") +
  loan_plot_theme

# Let us look at the loan amount along with loan status (one panel per status)
ggplot(lcdf, aes(x = loan_amnt)) +
  geom_histogram(
    aes(y = after_stat(density)),
    colour = "black", fill = "white", bins = 15
  ) +
  geom_density(alpha = .2, fill = "#FF6666") +
  ggtitle("Distribution of Number of Loans, Loan Amount with Status ") +
  facet_wrap(~loan_status) +
  xlab("Loan Amount ") +
  ylab("Density") +
  loan_plot_theme

# Let us look at the Interest Rate
ggplot(lcdf, aes(x = int_rate)) +
  geom_histogram(
    aes(y = after_stat(density)),
    colour = "black", fill = "white"
  ) +
  geom_density(alpha = .2, fill = "#FF6666") +
  ggtitle("Distribution of Interest Rate") +
  xlab("Interest Rate ") +
  ylab("Density") +
  loan_plot_theme

# Interest Rate with Grade (stacked counts, coloured by grade)
ggplot(lcdf, aes(x = int_rate)) +
  geom_histogram(aes(fill = grade)) +
  ggtitle("Distribution of Interest Rate With Grade") +
  xlab("Interest Rate ") +
  ylab("Number of Loans ") +
  loan_plot_theme
The default rate percentage increases from Grade A to G. Average payments are more than the average loan amount in each grade. Yes, these numbers surprise us: when we compare the returns of the different grades with the NASDAQ over the last 16 years (2007-2022, the current year), which yields about 16.7 percent on average, there are some grades which are not able to beat the market. Considering that both the NASDAQ and the P2P market are highly volatile, and that P2P lending carries even further risks, we would expect them to give higher average returns. If we had to invest in only one grade, depending on the risk appetite, we would have chosen grade C. Although it has a low average interest rate compared to the higher-risk grades (D, E, F), it has an average interest rate of 14%, which is sufficient to double the money in 5 years' time.
The loan amount varies from 400 to 38,000. Most loans are for an amount of approximately $12,000. Most Grade G loans are of lesser amounts. The number of charged-off loans is smaller overall, and this is evident in the graph. Both these distributions seem to be left skewed. In an ideal case these would have been normally distributed. There are loans which are higher than 30,000 and still paid. Also, there are loans of less than 10,000 that were charged off.
We can see that the average interest rate is higher in higher grades of loans. Intuitively we might be more interested in higher rates; however, they come with the trade-off of higher risk. The graph shows that the interest rate varies from 0-28. Most loans have an interest rate of ~13-14%. Trend of interest rate with grade: a lower grade corresponds to a lower interest rate. Intuitively, we should prefer a lower grade loan if both grades give the same interest rate. The most common loan is a grade B loan with ~13% interest.
# Look at the variable summaries -- focus on a subset of the variables of
# interest in your analyses & modeling
#lcdf %>% select_if(is.numeric) %>% summary()

# Title/axis-title styling shared by the plots in this section
outlier_plot_theme <- theme(
  plot.title = element_text(color = "#993333", size = 14, face = "bold.italic"),
  axis.title.x = element_text(color = "#993333", size = 14, face = "bold"),
  axis.title.y = element_text(color = "#993333", size = 14, face = "bold")
)

# Let us look at the outliers in loan amount - one box per grade
ggplot(lcdf, aes(x = loan_amnt)) +
  geom_boxplot(aes(fill = grade)) +
  xlab("Loan Amount ") +
  ylab("Grades of Each Loan ") +
  outlier_plot_theme

# Let us look at the annual income
# The histogram is scaled to density so the density curve shares its axis.
# after_stat(density) replaces the deprecated ..density.. notation, and the
# y axis is labelled as a density rather than a count.
ggplot(lcdf, aes(x = annual_inc)) +
  geom_histogram(
    aes(y = after_stat(density)),
    colour = "black", fill = "white"
  ) +
  geom_density(alpha = .2, fill = "#FF6666") +
  ggtitle("Distribution of Number of Loans With Annual Income ") +
  xlab("Annual Income ") +
  ylab("Density") +
  outlier_plot_theme

# Let us check how these very high incomes are associated with loan status
ggplot(lcdf, aes(x = annual_inc, y = loan_status)) +
  geom_boxplot(aes(fill = loan_status)) +
  ggtitle("Distribution of Number of Loans With Annual Income By Loan Status - Before Removing Extreme Outliers") +
  xlab("Annual Income ") +
  outlier_plot_theme
Yes, there are outliers. However, before removing them we should check their frequency and also consider the business use case: these outliers might be justified given the fact that the loan amount can vary.
For annual income the data seems to be really skewed towards the left; very few loans have an income of more than 1.5 million. A person coming to Lending Club for a loan with an income of more than 1.5 million might be suspicious. It is logical to ask why a person with a 1.5 million income would need a loan. Hence we will remove these 9 observations. We could alternatively assign a maximum value, but since we have 110k data points we can remove 9 rows.
The very high income cases are for paid-off loans. We can exclude them; however, if we do so we might not have a decision tree model which predicts the hypothesis that high income people pay off the loan in most cases. Going with the use case we will discard them and keep them in a separate dataframe. We shall observe what difference it makes to our models in the later part. Compared to the 110k data size the number looks really small, hence we will remove these.
## Chunk 12 <For knitting of .rmd file>
# Keep only loans whose reported annual income is at most 1.5 million
lcdf <- lcdf %>%
  filter(annual_inc <= 1.5e6)
# Annual income by loan status again, now without the extreme incomes
ggplot(lcdf, aes(x = annual_inc, y = loan_status)) +
  geom_boxplot(aes(fill = loan_status)) +
  labs(
    title = "Distribution of Number of Loans With Annual Income By Loan Status - After Removing Extreme Outliers ",
    x = "Annual Income "
  ) +
  theme(
    plot.title = element_text(color = "#993333", size = 14, face = "bold.italic"),
    axis.title.x = element_text(color = "#993333", size = 14, face = "bold"),
    axis.title.y = element_text(color = "#993333", size = 14, face = "bold")
  )
The plot looks much cleaner, and inferences can be drawn from the above as we have removed those outliers. We might argue that the data still has outliers, but by removing the ones above 1.5 IQR now we might lose essential information. However, this was also the case when we removed observations above 1.5 million, but they were just 9 observations out of the 109k observations.
Ratio of current balance/ high credit limit.
### Chunk 13
# Box plot of revol_util
ggplot(lcdf, aes(x = revol_util)) +
  geom_boxplot() +
  labs(title = "Distribution of Revol Util ", x = "Revol Util") +
  theme(
    plot.title = element_text(color = "#993333", size = 14, face = "bold.italic"),
    axis.title.x = element_text(color = "#993333", size = 14, face = "bold"),
    axis.title.y = element_text(color = "#993333", size = 14, face = "bold")
  )
# Values that boxplot() reports as outliers (computed without drawing a plot)
out_ru <- boxplot(lcdf$revol_util, plot = FALSE)[["out"]]
# Row positions holding those values, followed by the rows themselves
out_ru_i <- which(lcdf$revol_util %in% out_ru)
lcdf[out_ru_i, ]
## # A tibble: 9 × 145
## id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate
## <lgl> <lgl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 NA NA 3475 3475 3475 36 months 18.9
## 2 NA NA 12600 12600 12600 36 months 8.39
## 3 NA NA 15000 15000 15000 36 months 12.0
## 4 NA NA 20000 20000 20000 36 months 11.7
## 5 NA NA 35000 35000 35000 36 months 19.5
## 6 NA NA 5000 5000 5000 36 months 9.49
## 7 NA NA 35000 35000 35000 36 months 25.8
## 8 NA NA 25000 25000 25000 36 months 12.6
## 9 NA NA 10000 10000 10000 36 months 9.99
## # … with 138 more variables: installment <dbl>, grade <chr>, sub_grade <chr>,
## # emp_title <chr>, emp_length <chr>, home_ownership <chr>, annual_inc <dbl>,
## # verification_status <chr>, issue_d <dttm>, loan_status <chr>,
## # pymnt_plan <chr>, url <lgl>, desc <lgl>, purpose <chr>, title <chr>,
## # zip_code <chr>, addr_state <chr>, dti <dbl>, delinq_2yrs <dbl>,
## # earliest_cr_line <chr>, inq_last_6mths <dbl>, mths_since_last_delinq <dbl>,
## # mths_since_last_record <dbl>, open_acc <dbl>, pub_rec <dbl>, …
# We will remove the revol_util outlier rows identified in out_ru_i.
# Negative indexing with an empty index vector (lcdf[-integer(0), ]) returns
# zero rows, which would wipe the whole data frame if no outliers were found.
# Selecting the complement of out_ru_i keeps every row in that case.
lcdf <- lcdf[setdiff(seq_len(nrow(lcdf)), out_ru_i), ]
# Recoveries - gross amount recovered after a loan has been charged off
# Check whether recoveries occur only for charged-off loans
lcdf %>%
  group_by(loan_status) %>%
  summarise(Rec = sum(recoveries))
## # A tibble: 2 × 2
## loan_status Rec
## <chr> <dbl>
## 1 Charged Off 14231328.
## 2 Fully Paid 0
# Same check, alongside the total payments and their components
lcdf %>%
  group_by(loan_status) %>%
  summarise(
    Sum_Rec = sum(recoveries),
    TotPmnt = sum(total_pymnt),
    total_rec_prncp = sum(total_rec_prncp),
    total_rec_int = sum(total_rec_int),
    total_rec_late_fee = sum(total_rec_late_fee)
  )
## # A tibble: 2 × 6
## loan_status Sum_Rec TotPmnt total_rec_prncp total_rec_int total_rec_late_…
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Charged Off 14231328. 1.21e8 79860072. 26993115. 58197.
## 2 Fully Paid 0 1.39e9 1204556474. 183100781. 72092.
Hence recoveries are only for charged-off loans. This also goes with the general idea of recovery of credit: for any loan, a recovery will only exist if the loan was charged off. Sometimes a recovery might not be present for charged-off loans either; this is the case where there has been a loss.
The way to calculate the recovered amount for charged-off loans: ‘total_pymnt’ = ‘total_rec_prncp’ + ‘total_rec_int’ + ‘total_rec_late_fee’ + ‘recoveries’
# Preview the columns that feed the return calculation
head(select(lcdf, loan_status, int_rate, funded_amnt, total_pymnt))
## # A tibble: 6 × 4
## loan_status int_rate funded_amnt total_pymnt
## <chr> <dbl> <dbl> <dbl>
## 1 Fully Paid 23.0 4400 6120.
## 2 Fully Paid 22.0 5850 6377.
## 3 Fully Paid 6.24 5000 5496.
## 4 Fully Paid 15.0 1600 1840.
## 5 Fully Paid 9.17 16000 18128.
## 6 Fully Paid 8.18 3000 3394.
# Annualized return as a percentage, spread over a 36-month term:
# annRet = [(total payment - funded amount) / funded amount] * 12/36 * 100
lcdf[["annRet"]] <- with(
  lcdf,
  (total_pymnt - funded_amnt) / funded_amnt * (12 / 36) * 100
)
# Mean, spread and range of annRet for charged-off and fully paid loans
lcdf %>%
  group_by(loan_status) %>%
  summarise(
    avgRet = mean(annRet),
    stdRet = sd(annRet),
    minRet = min(annRet),
    maxRet = max(annRet)
  )
## # A tibble: 2 × 5
## loan_status avgRet stdRet minRet maxRet
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Charged Off -12.0 9.35 -33.3 14.4
## 2 Fully Paid 5.16 2.43 0 18.0
# Do charged off loans have negative returns?
# Count the loans with a negative annRet, split by loan status
lcdf %>%
  filter(annRet < 0) %>%
  count(loan_status)
## # A tibble: 1 × 2
## loan_status n
## <chr> <int>
## 1 Charged Off 13539
What is surprising here is the fact that the average return rate differs significantly from the average interest rate. The minimum return rate for some fully paid loans can be as low as 0. This might be because some loans are paid off earlier than the expected date.
## Chunk 16
# Fully Paid loans: interest, amount, payment and return statistics per grade
lcdf %>%
  filter(loan_status == "Fully Paid") %>%
  group_by(grade) %>%
  summarise(
    nLoans = n(),
    avgInterest = mean(int_rate),
    avgLoanAmt = mean(loan_amnt),
    avgPmnt = mean(total_pymnt),
    avgRet = mean(annRet),
    minRet = min(annRet),
    maxRet = max(annRet)
  )
## # A tibble: 7 × 8
## grade nLoans avgInterest avgLoanAmt avgPmnt avgRet minRet maxRet
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A 23480 7.19 14383. 15772. 3.19 0 5.19
## 2 B 33596 10.8 12546. 14357. 4.80 0.000333 8.18
## 3 C 23935 13.9 12038. 14244. 6.06 0.0134 10.5
## 4 D 10288 17.3 11745. 14401. 7.53 0 11.9
## 5 E 2700 20.0 11602. 14549. 8.64 0.0194 14.0
## 6 F 501 23.9 9134. 11975. 10.6 0.0255 18.0
## 7 G 50 26.5 10512 14144. 11.7 0.422 17.0
# Fully Paid loans again, broken down by sub_grade instead of grade
lcdf %>%
  filter(loan_status == "Fully Paid") %>%
  group_by(sub_grade) %>%
  summarise(
    nLoans = n(),
    avgInterest = mean(int_rate),
    avgLoanAmt = mean(loan_amnt),
    avgPmnt = mean(total_pymnt),
    avgRet = mean(annRet),
    minRet = min(annRet),
    maxRet = max(annRet)
  )
## # A tibble: 35 × 8
## sub_grade nLoans avgInterest avgLoanAmt avgPmnt avgRet minRet maxRet
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A1 3934 5.70 14098. 15167. 2.50 0.00518 3.34
## 2 A2 3745 6.43 13958. 15143. 2.82 0.0000476 3.72
## 3 A3 3851 7.13 14476. 15862. 3.16 0.0000208 4.20
## 4 A4 5388 7.52 14749. 16239. 3.35 0 4.70
## 5 A5 6562 8.28 14441. 16056. 3.69 0.00840 5.19
## 6 B1 6285 8.96 12935. 14480. 3.97 0.00909 6.62
## 7 B2 6922 10.0 12912. 14643. 4.44 0.0102 6.52
## 8 B3 7324 11.0 12545. 14387. 4.89 0.0296 7.01
## 9 B4 6829 11.9 12263. 14219. 5.26 0.000333 8.00
## 10 B5 6236 12.4 12058. 14033. 5.44 0.0132 8.18
## # … with 25 more rows
# Charged Off loans: interest, amount, payment and return statistics per grade
lcdf %>%
  filter(loan_status == "Charged Off") %>%
  group_by(grade) %>%
  summarise(
    nLoans = n(),
    avgInterest = mean(int_rate),
    avgLoanAmt = mean(loan_amnt),
    avgPmnt = mean(total_pymnt),
    avgRet = mean(annRet),
    minRet = min(annRet),
    maxRet = max(annRet)
  )
## # A tibble: 7 × 8
## grade nLoans avgInterest avgLoanAmt avgPmnt avgRet minRet maxRet
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A 1369 7.49 13747. 8781. -12.1 -32.3 5.80
## 2 B 4264 11.0 12195. 7939. -11.7 -33.3 13.8
## 3 C 5206 14.0 12085. 7792. -11.9 -33.3 9.54
## 4 D 3164 17.2 12376. 7719. -12.6 -33.3 11.3
## 5 E 1090 20.0 12722. 7858. -12.6 -33.3 11.7
## 6 F 252 23.9 10032. 5931. -12.2 -32.0 14.4
## 7 G 31 26.4 12469. 7056. -15.5 -28.6 4.86
# Charged Off loans again, broken down by sub_grade instead of grade
lcdf %>%
  filter(loan_status == "Charged Off") %>%
  group_by(sub_grade) %>%
  summarise(
    nLoans = n(),
    avgInterest = mean(int_rate),
    avgLoanAmt = mean(loan_amnt),
    avgPmnt = mean(total_pymnt),
    avgRet = mean(annRet),
    minRet = min(annRet),
    maxRet = max(annRet)
  )
## # A tibble: 34 × 8
## sub_grade nLoans avgInterest avgLoanAmt avgPmnt avgRet minRet maxRet
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A1 104 5.69 13727. 8584. -12.5 -32.3 2.00
## 2 A2 161 6.43 13333. 8150. -12.9 -31.4 2.74
## 3 A3 193 7.14 13754. 8798. -12.3 -31.3 4.05
## 4 A4 385 7.51 13878. 8969. -11.6 -32.3 4.79
## 5 A5 526 8.27 13778. 8869. -12.0 -32.3 5.80
## 6 B1 575 8.95 11918. 7733. -11.7 -31.4 5.35
## 7 B2 775 9.98 12405. 8080. -11.5 -33.3 6.19
## 8 B3 958 11.0 12448. 8058. -11.8 -33.3 5.72
## 9 B4 928 11.8 11966. 7751. -11.7 -33.3 6.74
## 10 B5 1028 12.4 12163. 8007. -11.6 -33.3 13.8
## # … with 24 more rows
# Chunk 17
# The two date columns used below: last payment date and issue date
lcdf %>%
  select(last_pymnt_d, issue_d) %>%
  head()
## # A tibble: 6 × 2
## last_pymnt_d issue_d
## <chr> <dttm>
## 1 Mar-2018 2015-03-01 00:00:00
## 2 Mar-2015 2014-05-01 00:00:00
## 3 Sep-2018 2015-09-01 00:00:00
## 4 Jun-2015 2014-05-01 00:00:00
## 5 Jun-2017 2015-05-01 00:00:00
## 6 Oct-2018 2015-11-01 00:00:00
# Bring last_pymnt_d to the same date-time type as issue_d: append a day
# ("-01") to the "Mon-YYYY" text and parse it as month-year-day
lcdf$last_pymnt_d <- parse_date_time(paste0(lcdf$last_pymnt_d, "-01"), "myd")
# Check that both columns now share the same format
head(lcdf[c("last_pymnt_d", "issue_d")])
## # A tibble: 6 × 2
## last_pymnt_d issue_d
## <dttm> <dttm>
## 1 2018-03-01 00:00:00 2015-03-01 00:00:00
## 2 2015-03-01 00:00:00 2014-05-01 00:00:00
## 3 2018-09-01 00:00:00 2015-09-01 00:00:00
## 4 2015-06-01 00:00:00 2014-05-01 00:00:00
## 5 2017-06-01 00:00:00 2015-05-01 00:00:00
## 6 2018-10-01 00:00:00 2015-11-01 00:00:00
# actualTerm: years between issue date and last payment for fully paid loans;
# every other loan is assigned the default of 3 years.
# actualReturn: simple-interest annual return in percent over that term,
#   r = (total - principal) / principal * 100 / n
# and 0 when the actual term is not positive.
lcdf <- lcdf %>%
  mutate(
    actualTerm = ifelse(
      loan_status == "Fully Paid",
      as.duration(issue_d %--% last_pymnt_d) / dyears(1),
      3
    ),
    actualReturn = ifelse(
      actualTerm > 0,
      (total_pymnt - funded_amnt) / funded_amnt * (1 / actualTerm) * 100,
      0
    )
  )
# Preview the inputs next to the derived term
lcdf %>%
  select(
    loan_status, int_rate, funded_amnt, total_pymnt,
    annRet, actualTerm, issue_d, last_pymnt_d
  ) %>%
  head()
## # A tibble: 6 × 8
## loan_status int_rate funded_amnt total_pymnt annRet actualTerm
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Fully Paid 23.0 4400 6120. 13.0 3.00
## 2 Fully Paid 22.0 5850 6377. 3.00 0.832
## 3 Fully Paid 6.24 5000 5496. 3.31 3.00
## 4 Fully Paid 15.0 1600 1840. 4.99 1.08
## 5 Fully Paid 9.17 16000 18128. 4.43 2.09
## 6 Fully Paid 8.18 3000 3394. 4.37 2.92
## # … with 2 more variables: issue_d <dttm>, last_pymnt_d <dttm>
# Same preview restricted to charged-off loans, including actualReturn
lcdf %>%
  filter(loan_status == "Charged Off") %>%
  select(
    loan_status, int_rate, funded_amnt, total_pymnt,
    annRet, actualTerm, actualReturn
  ) %>%
  head()
## # A tibble: 6 × 7
## loan_status int_rate funded_amnt total_pymnt annRet actualTerm actualReturn
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Charged Off 13.4 6500 2701. -19.5 3 -19.5
## 2 Charged Off 13.4 15000 9898. -11.3 3 -11.3
## 3 Charged Off 14.0 9000 6765. -8.28 3 -8.28
## 4 Charged Off 10.2 5000 3013. -13.2 3 -13.2
## 5 Charged Off 17.9 10575 5295. -16.6 3 -16.6
## 6 Charged Off 7.9 27000 3971. -28.4 3 -28.4
# Chunk 17
# For cost-based performance: average interest rate and average proportion of
# the funded amount gained or lost, grouped by loan_status
lcdf %>%
  group_by(loan_status) %>%
  summarise(
    meanintRate = mean(int_rate),
    meanRet = mean((total_pymnt - funded_amnt) / funded_amnt),
    meanRetPer = meanRet * 100,
    sumTotalpymt = sum(total_pymnt),
    sumFundedamnt = sum(funded_amnt),
    term = mean(actualTerm)
  )
## # A tibble: 2 × 7
## loan_status meanintRate meanRet meanRetPer sumTotalpymt sumFundedamnt term
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Charged Off 13.9 -0.361 -36.1 121142712. 189668875 3
## 2 Fully Paid 11.8 0.155 15.5 1387729348. 1204556475 2.14
# The same figures broken down by loan status and grade together
lcdf %>%
  group_by(loan_status, grade) %>%
  summarise(
    intRate = mean(int_rate),
    meanRet = mean((total_pymnt - funded_amnt) / funded_amnt),
    meanRetPer = meanRet * 100,
    sumTotalpymt = sum(total_pymnt),
    sumFundedamnt = sum(funded_amnt),
    term = mean(actualTerm)
  )
## # A tibble: 14 × 8
## # Groups: loan_status [2]
## loan_status grade intRate meanRet meanRetPer sumTotalpymt sumFundedamnt term
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Charged Off A 7.49 -0.362 -36.2 12021258. 18819050 3
## 2 Charged Off B 11.0 -0.350 -35.0 33852516. 51996225 3
## 3 Charged Off C 14.0 -0.357 -35.7 40567130. 62913550 3
## 4 Charged Off D 17.2 -0.377 -37.7 24423334. 39158675 3
## 5 Charged Off E 20.0 -0.378 -37.8 8565088. 13866675 3
## 6 Charged Off F 23.9 -0.365 -36.5 1494657. 2528175 3
## 7 Charged Off G 26.4 -0.466 -46.6 218729. 386525 3
## 8 Fully Paid A 7.19 0.0957 9.57 370317452. 337701500 2.21
## 9 Fully Paid B 10.8 0.144 14.4 482339194. 421484600 2.16
## 10 Fully Paid C 13.9 0.182 18.2 340925023. 288108225 2.08
## 11 Fully Paid D 17.3 0.226 22.6 148159630. 120834850 2.06
## 12 Fully Paid E 20.0 0.259 25.9 39281121. 31325525 2.03
## 13 Fully Paid F 23.9 0.317 31.7 5999721. 4576175 2.10
## 14 Fully Paid G 26.5 0.350 35.0 707207. 525600 2.12
# For Fully Paid loans, is the average return what you'd expect given the
# average interest rate? Compare both, plus the average term, per loan status.
lcdf %>%
  group_by(loan_status) %>%
  summarise(
    avgInt = mean(int_rate),
    avgRet = mean(actualReturn),
    avgTerm = mean(actualTerm)
  )
## # A tibble: 2 × 4
## loan_status avgInt avgRet avgTerm
## <chr> <dbl> <dbl> <dbl>
## 1 Charged Off 13.9 -12.0 3
## 2 Fully Paid 11.8 8.02 2.14
We also observe that the actual term of a loan is not 3 years in the case of fully paid loans. Indeed, some loans are fully paid earlier than 3 years. Charged-off loans are expected to have a negative return irrespective of the grade. Higher grades have a higher loss / negative mean return rate. But the distribution of the return is only between -0.36 and -0.466 in the case of charged-off loans. In the case of fully paid loans, higher grades give a higher average return. The range of return is wider: 0.09 to 0.349. We would want our investor to get the best returns and minimize losses at the same time.
# Chunk 21
# Distribution of the actual term for fully paid loans.
# after_stat(density) replaces the deprecated ..density.. notation, and the
# y axis is labelled as a density because the histogram is density-scaled.
ggplot(lcdf %>% filter(loan_status == "Fully Paid"), aes(x = actualTerm)) +
  geom_histogram(
    aes(y = after_stat(density)),
    colour = "black", fill = "white", bins = 50
  ) +
  ggtitle("Distribution of Actual Term ") +
  xlab("Actual Term ") +
  ylab("Density") +
  theme(
    plot.title = element_text(color = "#993333", size = 14, face = "bold.italic"),
    axis.title.x = element_text(color = "#993333", size = 14, face = "bold"),
    axis.title.y = element_text(color = "#993333", size = 14, face = "bold")
  )
# Actual term of fully paid loans, one box per grade
ggplot(
  lcdf %>% filter(loan_status == "Fully Paid"),
  aes(x = actualTerm, y = grade)
) +
  geom_boxplot(aes(fill = grade)) +
  ggtitle("Distribution of Actual Term With Loan Grade ") +
  xlab("Actual Term ") +
  ylab("Grade") +
  theme(
    plot.title = element_text(color = "#993333", size = 14, face = "bold.italic"),
    axis.title.x = element_text(color = "#993333", size = 14, face = "bold"),
    axis.title.y = element_text(color = "#993333", size = 14, face = "bold")
  )
# Turn emp_length into a factor with explicitly ordered levels, so that plots
# and tables list the employment lengths from "n/a" up to "10+ years"
lcdf$emp_length <- factor(
  lcdf$emp_length,
  levels = c("n/a", "< 1 year", "1 year", paste(2:9, "years"), "10+ years")
)
# Number of loans in each employment length
ggplot(data = lcdf, aes(x = emp_length)) +
  geom_bar() +
  labs(
    title = "Number of Loans in Each Employement Length ",
    x = "Employement Length ",
    y = "Number of Loans "
  ) +
  theme(
    plot.title = element_text(color = "#993333", size = 14, face = "bold.italic"),
    axis.title.x = element_text(color = "#993333", size = 14, face = "bold"),
    axis.title.y = element_text(color = "#993333", size = 14, face = "bold")
  )
# The same counts as a table, split by loan status
table(lcdf[["loan_status"]], lcdf[["emp_length"]])
##
## n/a < 1 year 1 year 2 years 3 years 4 years 5 years 6 years
## Charged Off 1345 1268 1097 1327 1265 895 983 757
## Fully Paid 5300 7515 6237 8471 7622 5607 5989 4692
##
## 7 years 8 years 9 years 10+ years
## Charged Off 737 724 598 4380
## Fully Paid 4837 4731 3589 29960
# Default percentage across employment lengths, together with the average
# interest rate, loan amount, actual return and actual term
lcdf %>%
  group_by(emp_length) %>%
  summarise(
    nLoans = n(),
    defaults = sum(loan_status == "Charged Off"),
    defaultPercentage = defaults / nLoans * 100,
    avgIntRate = mean(int_rate),
    avgLoanAmt = mean(loan_amnt),
    avgActRet = mean(actualReturn),
    avgActTerm = mean(actualTerm)
  )
## # A tibble: 12 × 8
## emp_length nLoans defaults defaultPercentage avgIntRate avgLoanAmt avgActRet
## <fct> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 n/a 6645 1345 20.2 12.5 10251. 3.94
## 2 < 1 year 8783 1268 14.4 12.1 12108. 5.01
## 3 1 year 7334 1097 15.0 12.2 12080. 5.11
## 4 2 years 9798 1327 13.5 12.1 12183. 5.39
## 5 3 years 8887 1265 14.2 12.1 12344. 5.22
## 6 4 years 6502 895 13.8 12.1 12661. 5.26
## 7 5 years 6972 983 14.1 12.1 12513. 5.19
## 8 6 years 5449 757 13.9 12.2 12475. 5.25
## 9 7 years 5574 737 13.2 12.1 12656. 5.40
## 10 8 years 5455 724 13.3 12.0 12935. 5.43
## 11 9 years 4187 598 14.3 12.1 12974. 5.17
## 12 10+ years 34340 4380 12.8 11.8 13662. 5.41
## # … with 1 more variable: avgActTerm <dbl>
# Box plots of loan amount, one per employment length
ggplot(lcdf, aes(x = loan_amnt, y = emp_length)) +
  geom_boxplot(aes(fill = emp_length)) +
  labs(
    x = "Loan Amount ",
    y = "Employment Length ",
    title = "Distribution of Loan Amount with Employement Length"
  ) +
  theme(
    plot.title = element_text(color = "#993333", size = 14, face = "bold.italic"),
    axis.title.x = element_text(color = "#993333", size = 14, face = "bold"),
    axis.title.y = element_text(color = "#993333", size = 14, face = "bold")
  )
The internal percentage of default within a grade differs by employment length. This is a good factor for understanding how employment length plays a role in defaults and returns.
# Number of loans for each stated purpose
lcdf %>%
  count(purpose)
## # A tibble: 13 × 2
## purpose n
## <chr> <int>
## 1 car 1083
## 2 credit_card 27091
## 3 debt_consolidation 63277
## 4 home_improvement 6190
## 5 house 432
## 6 major_purchase 2111
## 7 medical 1170
## 8 moving 755
## 9 other 5684
## 10 renewable_energy 65
## 11 small_business 1117
## 12 vacation 753
## 13 wedding 198
# Normalise purpose: trim surrounding whitespace, convert to a factor, and
# fold the rare categories "wedding" and "renewable_energy" into "other".
# The former `NULL = "H"` argument came from the forcats documentation
# example; "H" is not a level of purpose, so it only raised an
# "Unknown levels" warning and has been removed.
lcdf$purpose <- lcdf$purpose %>%
  as.character() %>%
  str_trim() %>%
  as.factor() %>%
  fct_collapse(other = c("wedding", "renewable_energy", "other"))
# Re-count the loans per purpose after collapsing
lcdf %>% group_by(purpose) %>% tally()
## # A tibble: 11 × 2
## purpose n
## <fct> <int>
## 1 car 1083
## 2 credit_card 27091
## 3 debt_consolidation 63277
## 4 home_improvement 6190
## 5 house 432
## 6 major_purchase 2111
## 7 medical 1170
## 8 moving 755
## 9 other 5947
## 10 small_business 1117
## 11 vacation 753
# Get the number of loans by loan purpose
ggplot(data = lcdf, aes(x = purpose)) + geom_bar() + ggtitle("Number of Loans By Purpose") + xlab("Purpose of Loan ") + ylab("Number of Loans ")+ theme(plot.title = element_text(color="#993333", size=14, face="bold.italic"), axis.title.x = element_text(color="#993333", size=14, face="bold"), axis.title.y = element_text(color="#993333", size=14, face="bold")) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Box plot of loan amount for each loan purpose
ggplot(data = lcdf, mapping = aes(x = loan_amnt, y = purpose, fill = purpose)) +
  geom_boxplot() +
  labs(x = "Loan Amount ", y = "Pupose of Each Loan ") +
  theme(
    plot.title = element_text(color = "#993333", size = 14, face = "bold.italic"),
    axis.title.x = element_text(color = "#993333", size = 14, face = "bold"),
    axis.title.y = element_text(color = "#993333", size = 14, face = "bold")
  )
# Per purpose: number of loans, number of charged-off loans, and their ratio
lcdf %>%
  group_by(purpose) %>%
  summarise(
    nLoans = n(),
    defaults = sum(loan_status == "Charged Off"),
    Default_per = defaults / nLoans
  )
## # A tibble: 11 × 4
## purpose nLoans defaults Default_per
## <fct> <int> <int> <dbl>
## 1 car 1083 125 0.115
## 2 credit_card 27091 3169 0.117
## 3 debt_consolidation 63277 9253 0.146
## 4 home_improvement 6190 780 0.126
## 5 house 432 70 0.162
## 6 major_purchase 2111 281 0.133
## 7 medical 1170 202 0.173
## 8 moving 755 137 0.181
## 9 other 5947 975 0.164
## 10 small_business 1117 259 0.232
## 11 vacation 753 125 0.166
# Does loan grade vary by purpose? Cross-tabulate purpose (rows) against grade (columns).
table(lcdf[["purpose"]], lcdf[["grade"]])
##
## A B C D E F G
## car 310 333 278 117 33 11 1
## credit_card 8852 10795 5464 1686 267 24 3
## debt_consolidation 12641 21988 18031 8128 2121 341 27
## home_improvement 1600 2033 1584 699 226 43 5
## house 51 99 114 92 49 23 4
## major_purchase 571 622 555 272 77 11 3
## medical 105 292 391 239 104 34 5
## moving 39 104 249 218 101 42 2
## other 522 1279 1907 1487 585 151 16
## small_business 92 166 301 324 164 57 13
## vacation 66 149 267 190 63 16 2
# Bivariate view: purpose (rows) against employment length (columns)
table(lcdf[["purpose"]], lcdf[["emp_length"]])
##
## n/a < 1 year 1 year 2 years 3 years 4 years 5 years
## car 57 90 92 136 87 67 74
## credit_card 1731 2415 1857 2533 2203 1618 1714
## debt_consolidation 3573 4975 4239 5501 5132 3705 3975
## home_improvement 513 330 273 435 425 359 413
## house 14 44 34 48 43 38 33
## major_purchase 135 162 144 215 191 140 161
## medical 88 79 89 115 92 61 75
## moving 50 122 88 86 62 47 60
## other 406 443 403 541 492 334 353
## small_business 29 78 67 122 114 82 73
## vacation 49 45 48 66 46 51 41
##
## 6 years 7 years 8 years 9 years 10+ years
## car 55 57 37 36 295
## credit_card 1346 1314 1247 968 8145
## debt_consolidation 3103 3262 3257 2544 20011
## home_improvement 289 343 310 219 2281
## house 23 19 20 17 99
## major_purchase 123 93 88 68 591
## medical 55 58 66 44 348
## moving 36 23 31 17 133
## other 314 293 297 205 1866
## small_business 69 76 61 38 308
## vacation 36 36 41 31 263
# Do borrowers with home-improvement loans own or rent their home? We would expect
# such loans to go mostly to owners; a tenant would rarely borrow to improve a home.
table(lcdf[["purpose"]], lcdf[["home_ownership"]])
##
## ANY MORTGAGE NONE OTHER OWN RENT
## car 0 459 0 0 124 500
## credit_card 0 12239 7 1 2800 12044
## debt_consolidation 0 29508 2 2 6052 27713
## home_improvement 0 4682 1 1 968 538
## house 0 121 0 0 54 257
## major_purchase 0 868 0 0 246 997
## medical 0 497 0 0 130 543
## moving 0 128 0 0 38 589
## other 1 2272 0 0 630 3044
## small_business 0 499 0 0 121 497
## vacation 0 263 0 0 81 409
More than half (58%) of the loans were taken for debt consolidation. This is in line with the Pareto principle (the 80:20 rule), as the top three purposes account for more than 80% of all loans. Small business has the highest default percentage: loans borrowed for small businesses are defaulted on the most in percentage terms. This may also indicate that borrowers come to Lending Club for small-business loans because they have already been declined for a loan by a bank. The loans show a very similar grade pattern irrespective of purpose: most loans are in grade B for some purposes and in grade C for others, followed by grade A. People with 10+ years of experience are the most common borrowers of credit card and debt consolidation loans. Home improvement loans are also more common among people with 10+ years of experience. Car loans are more common among people with 2 years of experience, which might reflect the fact that once people have been in a job for 2 years they want a car, for which they come to Lending Club. We can see that not all home improvement loans went to borrowers who own their house. This might be because some were doing home improvement in a rented house. Also, more than 60% of the homes were mortgaged. It can also happen that the borrower does not directly own the house (it may be owned by a partner) while still taking a loan to improve it. We can also see the distribution of loan amount by purpose: small business loans have a fairly wide spread, implying that the scale of small businesses may differ considerably.
# New attribute: proportion of satisfactory bankcard accounts.
#   num_bc_tl   - number of bankcard accounts
#   num_bc_sats - number of satisfactory bankcard accounts
# Rows with zero bankcard accounts get 0 instead of a division by zero.
lcdf <- lcdf %>%
  mutate(propSatisBankcardAccts = ifelse(num_bc_tl > 0, num_bc_sats / num_bc_tl, 0))
# Inspect the new column
summary(lcdf[["propSatisBankcardAccts"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.429 0.600 0.614 0.800 1.000 4094
# Box plot of the satisfactory-bankcard proportion, split by loan status
ggplot(
  data = lcdf,
  mapping = aes(x = propSatisBankcardAccts, y = loan_status, fill = loan_status)
) +
  geom_boxplot() +
  labs(
    title = "Distribution of Proportion of Satisfactory Bank Cards",
    x = "Proportion of Satisfactory Bank Cards ",
    y = " Loan Status "
  ) +
  theme(
    plot.title = element_text(color = "#993333", size = 14, face = "bold.italic"),
    axis.title.x = element_text(color = "#993333", size = 14, face = "bold"),
    axis.title.y = element_text(color = "#993333", size = 14, face = "bold")
  )
# Another derived attribute: the length of the borrower's credit history, i.e. the
# time between earliest_cr_line and issue_d.
# First fix the date format: append "-01" as the day component, then parse the
# result using the month-year-day ("myd") order.
lcdf <- lcdf %>%
  mutate(
    earliest_cr_line = parse_date_time(paste0(earliest_cr_line, "-01"), orders = "myd")
  )
head(lcdf$earliest_cr_line)
## [1] "2011-08-01 UTC" "2006-12-01 UTC" "1995-02-01 UTC" "1995-11-01 UTC"
## [5] "2001-02-01 UTC" "2003-01-01 UTC"
# Borrower history in years: span from earliest_cr_line to issue_d divided by one year
lcdf <- lcdf %>%
  mutate(borrHistory = as.duration(earliest_cr_line %--% issue_d) / dyears(1))
# Box plot of borrower history, split by loan status
ggplot(data = lcdf, mapping = aes(x = borrHistory, y = loan_status, fill = loan_status)) +
  geom_boxplot() +
  labs(
    x = "Borrower History in Years ",
    y = "Loan Status",
    title = "Distribution of Borrower History"
  ) +
  theme(
    plot.title = element_text(color = "#993333", size = 14, face = "bold.italic"),
    axis.title.x = element_text(color = "#993333", size = 14, face = "bold"),
    axis.title.y = element_text(color = "#993333", size = 14, face = "bold")
  )
# Another new attribute: ratio of open accounts to total accounts
# (0 when the borrower has no accounts, to avoid dividing by zero)
lcdf <- lcdf %>%
  mutate(openAccRatio = ifelse(total_acc > 0, open_acc / total_acc, 0))
summary(lcdf[["openAccRatio"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.3704 0.4815 0.5018 0.6154 1.0000
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.0000 0.3704 0.4815 0.5017 0.6154 1.0000
# Box plot of the open-to-total account ratio, one box per loan status.
# loan_status is mapped to y so the axis labelled " Loan Status " actually shows the
# status categories, matching the other loan-status box plots in this analysis.
ggplot(lcdf, aes(x = openAccRatio, y = loan_status)) +
  geom_boxplot(aes(fill = loan_status)) +
  xlab("Proportion of Open Account to Total Accounts ") +
  ylab(" Loan Status ") +
  theme(
    plot.title = element_text(color = "#993333", size = 14, face = "bold.italic"),
    axis.title.x = element_text(color = "#993333", size = 14, face = "bold"),
    axis.title.y = element_text(color = "#993333", size = 14, face = "bold")
  )
# Does the LC-assigned loan grade vary by borrower history? Mean history per grade.
lcdf %>%
  group_by(grade) %>%
  summarise(avgBorrHist = mean(borrHistory))
## # A tibble: 7 × 2
## grade avgBorrHist
## <chr> <dbl>
## 1 A 18.1
## 2 B 16.4
## 3 C 15.4
## 4 D 14.8
## 5 E 14.2
## 6 F 13.5
## 7 G 13.3
# Box plot of borrower history, one box per loan grade.
# The boxes are grouped by grade, so grade is mapped to y and the axis is labelled
# accordingly (it was previously labelled " Loan Status " with no y mapping).
ggplot(lcdf, aes(x = borrHistory, y = grade)) +
  geom_boxplot(aes(fill = grade)) +
  xlab("Borrower History ") +
  ylab("Loan Grade") +
  theme(
    plot.title = element_text(color = "#993333", size = 14, face = "bold.italic"),
    axis.title.x = element_text(color = "#993333", size = 14, face = "bold"),
    axis.title.y = element_text(color = "#993333", size = 14, face = "bold")
  )
# Mean, minimum, maximum and median borrower history for each grade
lcdf %>%
  group_by(grade) %>%
  summarise(
    avgBorrHist = mean(borrHistory),
    minBorrHist = min(borrHistory),
    maxBorrHist = max(borrHistory),
    medianBorrHist = median(borrHistory)
  )
## # A tibble: 7 × 5
## grade avgBorrHist minBorrHist maxBorrHist medianBorrHist
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 A 18.1 3.08 63.2 16.4
## 2 B 16.4 3.08 59.8 14.8
## 3 C 15.4 3.08 59.6 14.0
## 4 D 14.8 3.08 60.7 13.4
## 5 E 14.2 3.16 58.2 12.8
## 6 F 13.5 3.08 44.8 12.4
## 7 G 13.3 3.25 52 12
Yes, the assigned loan grade varies significantly with borrower history. We can also check the minimum, maximum and median in the box plot.
#glimpse(lcdf)
# Several variables are of character type - grade, sub_grade, verification_status, ...
# Convert every character column to a factor.
lcdf <- lcdf %>%
  mutate(across(where(is.character), as.factor))
# Check the data types after the conversion
#glimpse(lcdf)
Concept of leakage - In statistics and machine learning, leakage (also known as data leakage or target leakage) is the use of information in the model training process which would not be expected to be available at prediction time, causing the predictive scores (metrics) to overestimate the model’s utility when run in a production environment. Reference - https://en.wikipedia.org/wiki/Leakage_(machine_learning)#:~:text=In%20statistics%20and%20machine%20learning,when%20run%20in%20a%20production
# Variables identified for removal
varsToRemove <- c(
  "funded_amnt_inv", "term", "emp_title", "pymnt_plan", "earliest_cr_line",
  "title", "zip_code", "addr_state", "out_prncp", "out_prncp_inv",
  "total_pymnt_inv", "total_rec_prncp", "total_rec_int", "total_rec_late_fee",
  "recoveries", "collection_recovery_fee", "last_credit_pull_d", "policy_code",
  "disbursement_method", "debt_settlement_flag", "settlement_term",
  "application_type"
)
lcdf <- select(lcdf, -all_of(varsToRemove))
# Variables whose names start with "hardship" can cause leakage, because they are
# unknown at the time the loan is given. Inspect them before dropping.
select(lcdf, starts_with("hardship"))
## # A tibble: 109,926 × 12
## hardship_flag hardship_type hardship_reason hardship_status hardship_amount
## <fct> <lgl> <lgl> <lgl> <lgl>
## 1 N NA NA NA NA
## 2 N NA NA NA NA
## 3 N NA NA NA NA
## 4 N NA NA NA NA
## 5 N NA NA NA NA
## 6 N NA NA NA NA
## 7 N NA NA NA NA
## 8 N NA NA NA NA
## 9 N NA NA NA NA
## 10 N NA NA NA NA
## # … with 109,916 more rows, and 7 more variables: hardship_start_date <lgl>,
## # hardship_end_date <lgl>, hardship_length <lgl>, hardship_dpd <lgl>,
## # hardship_loan_status <lgl>, hardship_payoff_balance_amount <lgl>,
## # hardship_last_payment_amount <lgl>
# Drop the hardship columns
lcdf <- select(lcdf, -starts_with("hardship"))
# Likewise for the variables starting with "settlement": they arise after disbursement.
# Inspect them first.
select(lcdf, starts_with("settlement"))
## # A tibble: 109,926 × 4
## settlement_status settlement_date settlement_amount settlement_percentage
## <lgl> <lgl> <lgl> <lgl>
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
## 7 NA NA NA NA
## 8 NA NA NA NA
## 9 NA NA NA NA
## 10 NA NA NA NA
## # … with 109,916 more rows
# 4 columns
# Drop them
lcdf <- select(lcdf, -starts_with("settlement"))
# Additional leakage variables - based on our understanding
varsToRemove2 <- c(
  "last_pymnt_d", "last_pymnt_amnt", "issue_d", "next_pymnt_d",
  "deferral_term", "payment_plan_start_date", "debt_settlement_flag_date"
)
# last_pymnt_d, last_pymnt_amnt, next_pymnt_d, deferral_term, payment_plan_start_date, debt_settlement_flag_date
lcdf <- select(lcdf, -all_of(varsToRemove2))
Understanding leakage is very important in data mining, where we build predictive models from training data. A model that uses leakage variables will appear to be well trained; however, when it is given unseen data its predictions will be poor, because values for these variables will not be available.
What are the potential reasons for missing values in different variables? Are some of the missing values actually ‘zeros’ which are not recorded in the data? Is missing-ness informative in some way? Are there, for example, more or fewer defaults for cases where values of the attribute are missing?
# Columns that contain nothing but NA - list them before dropping
lcdf %>%
  select(where(~ all(is.na(.x))))
## # A tibble: 109,926 × 19
## id member_id url desc annual_inc_joint dti_joint verification_status_j…
## <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
## 1 NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA
## 7 NA NA NA NA NA NA NA
## 8 NA NA NA NA NA NA NA
## 9 NA NA NA NA NA NA NA
## 10 NA NA NA NA NA NA NA
## # … with 109,916 more rows, and 12 more variables: revol_bal_joint <lgl>,
## # sec_app_earliest_cr_line <lgl>, sec_app_inq_last_6mths <lgl>,
## # sec_app_mort_acc <lgl>, sec_app_open_acc <lgl>, sec_app_revol_util <lgl>,
## # sec_app_open_act_il <lgl>, sec_app_num_rev_accts <lgl>,
## # sec_app_chargeoff_within_12_mths <lgl>,
## # sec_app_collections_12_mths_ex_med <lgl>,
## # sec_app_mths_since_last_major_derog <lgl>, …
# Keep only the columns that have at least one non-missing value
lcdf <- lcdf %>%
  select(where(~ !all(is.na(.x))))
# Names of the columns with at least one missing value
names(which(colSums(is.na(lcdf)) > 0))
## [1] "dti" "mths_since_last_delinq"
## [3] "mths_since_last_record" "revol_util"
## [5] "mths_since_last_major_derog" "tot_coll_amt"
## [7] "tot_cur_bal" "open_acc_6m"
## [9] "open_act_il" "open_il_12m"
## [11] "open_il_24m" "mths_since_rcnt_il"
## [13] "total_bal_il" "il_util"
## [15] "open_rv_12m" "open_rv_24m"
## [17] "max_bal_bc" "all_util"
## [19] "total_rev_hi_lim" "inq_fi"
## [21] "total_cu_tl" "inq_last_12m"
## [23] "acc_open_past_24mths" "avg_cur_bal"
## [25] "bc_open_to_buy" "bc_util"
## [27] "mo_sin_old_il_acct" "mo_sin_old_rev_tl_op"
## [29] "mo_sin_rcnt_rev_tl_op" "mo_sin_rcnt_tl"
## [31] "mort_acc" "mths_since_recent_bc"
## [33] "mths_since_recent_bc_dlq" "mths_since_recent_inq"
## [35] "mths_since_recent_revol_delinq" "num_accts_ever_120_pd"
## [37] "num_actv_bc_tl" "num_actv_rev_tl"
## [39] "num_bc_sats" "num_bc_tl"
## [41] "num_il_tl" "num_op_rev_tl"
## [43] "num_rev_accts" "num_rev_tl_bal_gt_0"
## [45] "num_sats" "num_tl_120dpd_2m"
## [47] "num_tl_30dpd" "num_tl_90g_dpd_24m"
## [49] "num_tl_op_past_12m" "pct_tl_nvr_dlq"
## [51] "percent_bc_gt_75" "tot_hi_cred_lim"
## [53] "total_bal_ex_mort" "total_bc_limit"
## [55] "total_il_high_credit_limit" "propSatisBankcardAccts"
# Proportion of missing values, for the columns that have any
options(scipen = 999) # print the proportions without scientific notation
Filter(function(p) p > 0, colMeans(is.na(lcdf)))
## dti mths_since_last_delinq
## 0.000009097029 0.505649254953
## mths_since_last_record revol_util
## 0.835034477740 0.000418463330
## mths_since_last_major_derog tot_coll_amt
## 0.733393373724 0.037243236359
## tot_cur_bal open_acc_6m
## 0.037243236359 0.974482833906
## open_act_il open_il_12m
## 0.974482833906 0.974482833906
## open_il_24m mths_since_rcnt_il
## 0.974482833906 0.975274275422
## total_bal_il il_util
## 0.974482833906 0.978094354384
## open_rv_12m open_rv_24m
## 0.974482833906 0.974482833906
## max_bal_bc all_util
## 0.974482833906 0.974482833906
## total_rev_hi_lim inq_fi
## 0.037243236359 0.974482833906
## total_cu_tl inq_last_12m
## 0.974482833906 0.974482833906
## acc_open_past_24mths avg_cur_bal
## 0.009879373397 0.037270527446
## bc_open_to_buy bc_util
## 0.019513127013 0.020140822008
## mo_sin_old_il_acct mo_sin_old_rev_tl_op
## 0.073558575769 0.037243236359
## mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl
## 0.037243236359 0.037243236359
## mort_acc mths_since_recent_bc
## 0.009879373397 0.018812655787
## mths_since_recent_bc_dlq mths_since_recent_inq
## 0.752924694795 0.116905918527
## mths_since_recent_revol_delinq num_accts_ever_120_pd
## 0.652256972873 0.037243236359
## num_actv_bc_tl num_actv_rev_tl
## 0.037243236359 0.037243236359
## num_bc_sats num_bc_tl
## 0.021532667431 0.037243236359
## num_il_tl num_op_rev_tl
## 0.037243236359 0.037243236359
## num_rev_accts num_rev_tl_bal_gt_0
## 0.037243236359 0.037243236359
## num_sats num_tl_120dpd_2m
## 0.021532667431 0.072103051143
## num_tl_30dpd num_tl_90g_dpd_24m
## 0.037243236359 0.037243236359
## num_tl_op_past_12m pct_tl_nvr_dlq
## 0.037243236359 0.037452468024
## percent_bc_gt_75 tot_hi_cred_lim
## 0.020195404181 0.037243236359
## total_bal_ex_mort total_bc_limit
## 0.009879373397 0.009879373397
## total_il_high_credit_limit propSatisBankcardAccts
## 0.037243236359 0.037243236359
# Columns in which more than 60% of the values are missing
names(which(colMeans(is.na(lcdf)) > 0.6))
## [1] "mths_since_last_record" "mths_since_last_major_derog"
## [3] "open_acc_6m" "open_act_il"
## [5] "open_il_12m" "open_il_24m"
## [7] "mths_since_rcnt_il" "total_bal_il"
## [9] "il_util" "open_rv_12m"
## [11] "open_rv_24m" "max_bal_bc"
## [13] "all_util" "inq_fi"
## [15] "total_cu_tl" "inq_last_12m"
## [17] "mths_since_recent_bc_dlq" "mths_since_recent_revol_delinq"
# Drop the columns with more than 60% missing values
nm <- names(which(colMeans(is.na(lcdf)) > 0.6))
lcdf <- select(lcdf, -all_of(nm))
# Impute missing values for the remaining variables that have them
# - first list those columns with their proportion of missing values
Filter(function(p) p > 0, colMeans(is.na(lcdf)))
## dti mths_since_last_delinq
## 0.000009097029 0.505649254953
## revol_util tot_coll_amt
## 0.000418463330 0.037243236359
## tot_cur_bal total_rev_hi_lim
## 0.037243236359 0.037243236359
## acc_open_past_24mths avg_cur_bal
## 0.009879373397 0.037270527446
## bc_open_to_buy bc_util
## 0.019513127013 0.020140822008
## mo_sin_old_il_acct mo_sin_old_rev_tl_op
## 0.073558575769 0.037243236359
## mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl
## 0.037243236359 0.037243236359
## mort_acc mths_since_recent_bc
## 0.009879373397 0.018812655787
## mths_since_recent_inq num_accts_ever_120_pd
## 0.116905918527 0.037243236359
## num_actv_bc_tl num_actv_rev_tl
## 0.037243236359 0.037243236359
## num_bc_sats num_bc_tl
## 0.021532667431 0.037243236359
## num_il_tl num_op_rev_tl
## 0.037243236359 0.037243236359
## num_rev_accts num_rev_tl_bal_gt_0
## 0.037243236359 0.037243236359
## num_sats num_tl_120dpd_2m
## 0.021532667431 0.072103051143
## num_tl_30dpd num_tl_90g_dpd_24m
## 0.037243236359 0.037243236359
## num_tl_op_past_12m pct_tl_nvr_dlq
## 0.037243236359 0.037452468024
## percent_bc_gt_75 tot_hi_cred_lim
## 0.020195404181 0.037243236359
## total_bal_ex_mort total_bc_limit
## 0.009879373397 0.009879373397
## total_il_high_credit_limit propSatisBankcardAccts
## 0.037243236359 0.037243236359
# Summary statistics for every column that still has missing values
nm <- names(which(colSums(is.na(lcdf)) > 0))
lcdf %>%
  select(all_of(nm)) %>%
  summary()
## dti mths_since_last_delinq revol_util tot_coll_amt
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.0
## 1st Qu.: 11.58 1st Qu.: 16.00 1st Qu.: 36.70 1st Qu.: 0.0
## Median : 17.28 Median : 31.00 Median : 54.50 Median : 0.0
## Mean : 17.82 Mean : 34.19 Mean : 54.07 Mean : 224.1
## 3rd Qu.: 23.63 3rd Qu.: 50.00 3rd Qu.: 72.20 3rd Qu.: 0.0
## Max. :137.40 Max. :188.00 Max. :117.90 Max. :143558.0
## NA's :1 NA's :55584 NA's :46 NA's :4094
## tot_cur_bal total_rev_hi_lim acc_open_past_24mths avg_cur_bal
## Min. : 0 Min. : 0 Min. : 0.000 Min. : 0
## 1st Qu.: 25541 1st Qu.: 12800 1st Qu.: 2.000 1st Qu.: 2771
## Median : 64569 Median : 22000 Median : 4.000 Median : 6273
## Mean : 128436 Mean : 30504 Mean : 4.356 Mean : 12441
## 3rd Qu.: 190600 3rd Qu.: 37600 3rd Qu.: 6.000 3rd Qu.: 17107
## Max. :3370799 Max. :1046900 Max. :40.000 Max. :312125
## NA's :4094 NA's :4094 NA's :1086 NA's :4097
## bc_open_to_buy bc_util mo_sin_old_il_acct mo_sin_old_rev_tl_op
## Min. : 0 Min. : 0.00 Min. : 0.0 Min. : 5.0
## 1st Qu.: 1173 1st Qu.: 42.70 1st Qu.: 95.0 1st Qu.:115.0
## Median : 3878 Median : 66.50 Median :128.0 Median :164.0
## Mean : 9010 Mean : 62.68 Mean :124.8 Mean :181.7
## 3rd Qu.: 10600 3rd Qu.: 86.30 3rd Qu.:152.0 3rd Qu.:230.0
## Max. :278899 Max. :255.20 Max. :519.0 Max. :757.0
## NA's :2145 NA's :2214 NA's :8086 NA's :4094
## mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl mort_acc mths_since_recent_bc
## Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 4.00 1st Qu.: 3.000 1st Qu.: 0.000 1st Qu.: 6.00
## Median : 8.00 Median : 6.000 Median : 1.000 Median : 14.00
## Mean : 13.27 Mean : 8.263 Mean : 1.627 Mean : 24.53
## 3rd Qu.: 16.00 3rd Qu.: 10.000 3rd Qu.: 3.000 3rd Qu.: 30.00
## Max. :372.00 Max. :197.000 Max. :34.000 Max. :555.00
## NA's :4094 NA's :4094 NA's :1086 NA's :2068
## mths_since_recent_inq num_accts_ever_120_pd num_actv_bc_tl num_actv_rev_tl
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 2.000 1st Qu.: 0.000 1st Qu.: 2.000 1st Qu.: 3.000
## Median : 5.000 Median : 0.000 Median : 3.000 Median : 5.000
## Mean : 6.928 Mean : 0.498 Mean : 3.652 Mean : 5.676
## 3rd Qu.:10.000 3rd Qu.: 0.000 3rd Qu.: 5.000 3rd Qu.: 7.000
## Max. :25.000 Max. :35.000 Max. :30.000 Max. :38.000
## NA's :12851 NA's :4094 NA's :4094 NA's :4094
## num_bc_sats num_bc_tl num_il_tl num_op_rev_tl
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 3.000 1st Qu.: 5.000 1st Qu.: 3.000 1st Qu.: 5.000
## Median : 4.000 Median : 7.000 Median : 6.000 Median : 7.000
## Mean : 4.652 Mean : 8.324 Mean : 8.076 Mean : 8.183
## 3rd Qu.: 6.000 3rd Qu.:11.000 3rd Qu.:11.000 3rd Qu.:10.000
## Max. :46.000 Max. :60.000 Max. :97.000 Max. :58.000
## NA's :2367 NA's :4094 NA's :4094 NA's :4094
## num_rev_accts num_rev_tl_bal_gt_0 num_sats num_tl_120dpd_2m
## Min. : 1.00 Min. : 0.000 Min. : 0.00 Min. :0.000
## 1st Qu.: 9.00 1st Qu.: 3.000 1st Qu.: 8.00 1st Qu.:0.000
## Median :13.00 Median : 5.000 Median :10.00 Median :0.000
## Mean :14.78 Mean : 5.642 Mean :11.38 Mean :0.001
## 3rd Qu.:19.00 3rd Qu.: 7.000 3rd Qu.:14.00 3rd Qu.:0.000
## Max. :92.00 Max. :38.000 Max. :62.00 Max. :2.000
## NA's :4094 NA's :4094 NA's :2367 NA's :7926
## num_tl_30dpd num_tl_90g_dpd_24m num_tl_op_past_12m pct_tl_nvr_dlq
## Min. :0.000 Min. : 0.000 Min. : 0.000 Min. : 14.80
## 1st Qu.:0.000 1st Qu.: 0.000 1st Qu.: 1.000 1st Qu.: 91.00
## Median :0.000 Median : 0.000 Median : 2.000 Median : 97.80
## Mean :0.003 Mean : 0.093 Mean : 2.036 Mean : 94.07
## 3rd Qu.:0.000 3rd Qu.: 0.000 3rd Qu.: 3.000 3rd Qu.:100.00
## Max. :4.000 Max. :20.000 Max. :25.000 Max. :100.00
## NA's :4094 NA's :4094 NA's :4094 NA's :4117
## percent_bc_gt_75 tot_hi_cred_lim total_bal_ex_mort total_bc_limit
## Min. : 0.00 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 16.70 1st Qu.: 43057 1st Qu.: 18781 1st Qu.: 7000
## Median : 50.00 Median : 93220 Median : 33795 Median : 13600
## Mean : 48.31 Mean : 158917 Mean : 45639 Mean : 20163
## 3rd Qu.: 75.00 3rd Qu.: 228957 3rd Qu.: 57068 3rd Qu.: 26000
## Max. :100.00 Max. :8700253 Max. :1043860 Max. :456200
## NA's :2220 NA's :4094 NA's :1086 NA's :1086
## total_il_high_credit_limit propSatisBankcardAccts
## Min. : 0 Min. :0.000
## 1st Qu.: 12000 1st Qu.:0.429
## Median : 28170 Median :0.600
## Mean : 38108 Mean :0.614
## 3rd Qu.: 51194 3rd Qu.:0.800
## Max. :975560 Max. :1.000
## NA's :4094 NA's :4094
# Replace missing values in the listed columns with the column median
# (mths_since_recent_inq uses the fixed value 5)
lcdf <- lcdf %>%
  replace_na(
    list(
      mths_since_last_delinq = median(lcdf$mths_since_last_delinq, na.rm = TRUE),
      bc_open_to_buy = median(lcdf$bc_open_to_buy, na.rm = TRUE),
      mo_sin_old_il_acct = median(lcdf$mo_sin_old_il_acct, na.rm = TRUE),
      mths_since_recent_bc = median(lcdf$mths_since_recent_bc, na.rm = TRUE),
      mths_since_recent_inq = 5,
      num_tl_120dpd_2m = median(lcdf$num_tl_120dpd_2m, na.rm = TRUE),
      percent_bc_gt_75 = median(lcdf$percent_bc_gt_75, na.rm = TRUE),
      bc_util = median(lcdf$bc_util, na.rm = TRUE)
    )
  )
# Any numeric column that still has missing values gets its own median
lcdf <- lcdf %>%
  mutate(across(where(is.numeric), ~ ifelse(is.na(.x), median(.x, na.rm = TRUE), .x)))
dim(lcdf)
## [1] 109926 69
Yes, some columns have the same percentage of missing values. This could be because they are dependent columns: the information source of one column may also be the source of the others. Missing values can arise in the following ways - 1. Missing Completely at Random 2. Missing at Random 3. Missing Not at Random. We could use various techniques taught in class to handle these missing values - 1. Imputing values 2. Leaving out those rows. However, the approach for each column can be different. If missing values do not relate well to larger values, then we should not assume that the missing values stand for values higher than the maximum. We remove columns with more than 60% missing values; this threshold was chosen by trial and error. However, the approach to removing columns with NAs could differ from case to case, and dropping a column could also mean losing a very important variable. We can tune our model based on the results.
Which variables are individually predictive of the outcome? Considering a single-variable model to predict loan_status, what could be a measure of performance? AUC? For a univariate model with a variable, say, x1, what should we consider as the model ‘score’ for predicting loan_status? Can we take the values of x1 as the score for a model y_hat=f(x1)? Using this approximate approach, we can then compute the AUC for each variable. The AUC of a classifier is equivalent to the probability that the classifier will rank a randomly chosen positive instance higher than a randomly chosen negative instance. Reference - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7695228/
# Univariate AUC: auc(response, predictor) returns the AUC obtained when a single
# variable is used as the score for loan_status. Factors are converted to their
# numeric codes so they can be scored as well.
# vapply() guarantees a named numeric vector (one AUC per column) regardless of input.
aucAll <- vapply(
  lcdf %>%
    mutate(across(where(is.factor), as.numeric)) %>%
    select(where(is.numeric)),
  function(predictor) as.numeric(auc(response = lcdf$loan_status, predictor = predictor)),
  numeric(1)
)
library(broom)
# Variables with AUC above 0.5, as a two-column tibble (names, x).
# enframe() replaces tidy(), whose method for atomic vectors is deprecated in broom.
enframe(aucAll[aucAll > 0.5], name = "names", value = "x")
## # A tibble: 46 × 2
## names x
## <chr> <dbl>
## 1 loan_amnt 0.515
## 2 funded_amnt 0.515
## 3 int_rate 0.656
## 4 installment 0.501
## 5 grade 0.652
## 6 sub_grade 0.663
## 7 emp_length 0.529
## 8 home_ownership 0.552
## 9 annual_inc 0.575
## 10 loan_status 1
## # … with 36 more rows
# All variables ranked by AUC, highest first. The sort uses the tibble's own `x`
# column rather than the external aucAll vector, so it does not depend on the two
# happening to be aligned; enframe() replaces the deprecated tidy() on a vector.
enframe(aucAll, name = "names", value = "x") %>%
  arrange(desc(x))
## # A tibble: 69 × 2
## names x
## <chr> <dbl>
## 1 loan_status 1
## 2 actualReturn 0.987
## 3 annRet 0.968
## 4 total_pymnt 0.752
## 5 actualTerm 0.679
## 6 sub_grade 0.663
## 7 int_rate 0.656
## 8 grade 0.652
## 9 acc_open_past_24mths 0.579
## 10 annual_inc 0.575
## # … with 59 more rows
For example, actualReturn and actualTerm are in the data - we have kept these because they will be useful for evaluating the performance of models. Effect of a high AUC on the model: the AUC tells how well the model is capable of distinguishing between classes. The higher the AUC, the better the model is at predicting 0 classes as 0 and 1 classes as 1. By analogy, the higher the AUC, the better the model is at distinguishing between patients with the disease and patients without it. We will need to make sure these variables are not included when building the models.
Defining train and test sets - Train data set: used to fit the machine learning model. Test data set: used to evaluate the fitted machine learning model. While there are no set rules for the proportion of test and train data, the training split should be large enough to train the model to predict well on unseen data. So we could try various splits and check how the model performs. Our aim throughout is good generalization. Reference - https://machinelearningmastery.com/train-test-split-for-evaluating-machine-learning-algorithms/
# First trial: a 50% split
# Fix the seed so the partition is reproducible
set.seed(123)
TRNPROP <- 0.5 # proportion of examples in the training sample
# Store the row count and print it
(nr <- nrow(lcdf))
## [1] 109926
# Draw the training row indices without replacement
trnIndex <- sample(seq_len(nr), size = round(TRNPROP * nr), replace = FALSE)
lcdfTrn <- lcdf[trnIndex, ] # training data
lcdfTst <- lcdf[-trnIndex, ] # test data: every row not drawn for training
# Variables for modelling
# We do not want to use every variable - the leakage variables found in the AUC table are left out.
# actualTerm, actualReturn, annRet and total_pymnt are useful for performance assessment,
# but should not be used in building the model.
varsOmit <- c("actualTerm", "actualReturn", "annRet", "total_pymnt")
# Check whether the target variable is a factor
class(lcdf[["loan_status"]])
## [1] "factor"
# Order the target's levels so that "Fully Paid" comes first and "Charged Off" second.
# lcdfTrn and lcdfTst were copied from lcdf before this point, so the same ordering
# is applied to them too; releveling lcdf alone would leave the modelling data unchanged.
loanStatusLevels <- c("Fully Paid", "Charged Off")
lcdf$loan_status <- factor(lcdf$loan_status, levels = loanStatusLevels)
lcdfTrn$loan_status <- factor(lcdfTrn$loan_status, levels = loanStatusLevels)
lcdfTst$loan_status <- factor(lcdfTst$loan_status, levels = loanStatusLevels)
# Decision tree - first model: information-gain splits, no explicit cp
# (so rpart's default applies); varsOmit columns are excluded from the predictors
lcDT1 <- rpart(
  formula = loan_status ~ .,
  data = lcdfTrn %>% select(-all_of(varsOmit)),
  method = "class",
  parms = list(split = "information"),
  control = rpart.control(minsplit = 30)
)
printcp(lcDT1)
##
## Classification tree:
## rpart(formula = loan_status ~ ., data = lcdfTrn %>% select(-all_of(varsOmit)),
## method = "class", parms = list(split = "information"), control = rpart.control(minsplit = 30))
##
## Variables actually used in tree construction:
## character(0)
##
## Root node error: 7679/54963 = 0.13971
##
## n= 54963
##
## CP nsplit rel error xerror xstd
## 1 0 0 1 0 0
The complexity parameter (CP) is not the error in that particular node. It is the amount by which splitting that node improved the relative error. For example, if splitting the original root node dropped the relative error from 1.0 to 0.5, the CP of the root node would be 0.5. If the CP of the next node is only 0.01 (which is the default limit for deciding when to consider splits), then splitting that node only results in an improvement of 0.01, so the tree building stops there.
# Refit with a much smaller complexity parameter (cp = 0.0001) and minsplit = 50
lcDT1 <- rpart(
  formula = loan_status ~ .,
  data = lcdfTrn %>% select(-all_of(varsOmit)),
  method = "class",
  parms = list(split = "information"),
  control = rpart.control(cp = 0.0001, minsplit = 50)
)
# Check performance at the different cp levels
printcp(lcDT1)
##
## Classification tree:
## rpart(formula = loan_status ~ ., data = lcdfTrn %>% select(-all_of(varsOmit)),
## method = "class", parms = list(split = "information"), control = rpart.control(cp = 0.0001,
## minsplit = 50))
##
## Variables actually used in tree construction:
## [1] acc_open_past_24mths annual_inc
## [3] avg_cur_bal bc_open_to_buy
## [5] bc_util borrHistory
## [7] delinq_2yrs dti
## [9] emp_length home_ownership
## [11] initial_list_status inq_last_6mths
## [13] installment int_rate
## [15] loan_amnt mo_sin_old_il_acct
## [17] mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op
## [19] mo_sin_rcnt_tl mths_since_last_delinq
## [21] mths_since_recent_bc mths_since_recent_inq
## [23] num_actv_bc_tl num_actv_rev_tl
## [25] num_bc_sats num_bc_tl
## [27] num_il_tl num_op_rev_tl
## [29] num_rev_accts num_rev_tl_bal_gt_0
## [31] num_tl_op_past_12m open_acc
## [33] openAccRatio pct_tl_nvr_dlq
## [35] percent_bc_gt_75 propSatisBankcardAccts
## [37] purpose revol_bal
## [39] revol_util sub_grade
## [41] tot_coll_amt tot_cur_bal
## [43] tot_hi_cred_lim total_acc
## [45] total_bal_ex_mort total_bc_limit
## [47] total_il_high_credit_limit total_rev_hi_lim
## [49] verification_status
##
## Root node error: 7679/54963 = 0.13971
##
## n= 54963
##
## CP nsplit rel error xerror xstd
## 1 0.00035347 0 1.00000 1.0000 0.010584
## 2 0.00032556 39 0.97877 1.0203 0.010674
## 3 0.00030386 41 0.97812 1.0223 0.010682
## 4 0.00026045 44 0.97721 1.0359 0.010742
## 5 0.00021704 51 0.97539 1.0546 0.010821
## 6 0.00019534 91 0.95950 1.0779 0.010919
## 7 0.00018810 93 0.95911 1.0826 0.010939
## 8 0.00018232 151 0.93866 1.0849 0.010948
## 9 0.00016278 157 0.93710 1.0918 0.010977
## 10 0.00015627 161 0.93645 1.0962 0.010995
## 11 0.00014650 188 0.92890 1.0983 0.011004
## 12 0.00014108 202 0.92655 1.1250 0.011112
## 13 0.00013023 214 0.92486 1.1271 0.011120
## 14 0.00011576 247 0.91900 1.1357 0.011155
## 15 0.00011162 256 0.91796 1.1400 0.011172
## 16 0.00010418 279 0.91431 1.1456 0.011194
## 17 0.00010129 284 0.91379 1.1518 0.011219
## 18 0.00010000 293 0.91288 1.1520 0.011219
# First ten entries of the tree's variable-importance vector.
head(lcDT1$variable.importance, 10)
## sub_grade int_rate grade bc_open_to_buy
## 943.6137 787.3146 755.7075 287.1184
## total_bc_limit total_rev_hi_lim emp_length tot_cur_bal
## 264.6723 243.5815 182.2893 164.2640
## tot_hi_cred_lim installment
## 145.2483 139.2110
We will now prune the tree to see how its performance changes; the pruning will be based on different cp values.
# Prune the full tree at several cp values to compare the resulting models.
# First candidate: cp = 0.0015.
lcDT1p1 <- prune.rpart(lcDT1, cp = 0.0015)
# Complexity-parameter table of the pruned tree.
printcp(lcDT1p1)
##
## Classification tree:
## rpart(formula = loan_status ~ ., data = lcdfTrn %>% select(-all_of(varsOmit)),
## method = "class", parms = list(split = "information"), control = rpart.control(cp = 0.0001,
## minsplit = 50))
##
## Variables actually used in tree construction:
## character(0)
##
## Root node error: 7679/54963 = 0.13971
##
## n= 54963
##
## CP nsplit rel error xerror xstd
## 1 0.0015 0 1 1 0.010584
# First ten variable-importance entries of the cp = 0.0015 pruned tree.
head(lcDT1p1$variable.importance, 10)
## logical(0)
# Second candidate: prune the full tree at cp = 0.0002.
lcDT1p2 <- prune.rpart(lcDT1, cp = 0.0002)
# Complexity-parameter table of the pruned tree.
printcp(lcDT1p2)
##
## Classification tree:
## rpart(formula = loan_status ~ ., data = lcdfTrn %>% select(-all_of(varsOmit)),
## method = "class", parms = list(split = "information"), control = rpart.control(cp = 0.0001,
## minsplit = 50))
##
## Variables actually used in tree construction:
## [1] acc_open_past_24mths annual_inc
## [3] avg_cur_bal bc_open_to_buy
## [5] bc_util borrHistory
## [7] delinq_2yrs dti
## [9] emp_length inq_last_6mths
## [11] installment loan_amnt
## [13] mo_sin_old_il_acct mo_sin_old_rev_tl_op
## [15] mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl
## [17] mths_since_last_delinq mths_since_recent_bc
## [19] num_actv_bc_tl num_actv_rev_tl
## [21] num_il_tl num_op_rev_tl
## [23] num_rev_accts openAccRatio
## [25] pct_tl_nvr_dlq propSatisBankcardAccts
## [27] purpose revol_bal
## [29] revol_util sub_grade
## [31] tot_coll_amt tot_cur_bal
## [33] tot_hi_cred_lim total_bal_ex_mort
## [35] total_bc_limit total_il_high_credit_limit
## [37] total_rev_hi_lim
##
## Root node error: 7679/54963 = 0.13971
##
## n= 54963
##
## CP nsplit rel error xerror xstd
## 1 0.00035347 0 1.00000 1.0000 0.010584
## 2 0.00032556 39 0.97877 1.0203 0.010674
## 3 0.00030386 41 0.97812 1.0223 0.010682
## 4 0.00026045 44 0.97721 1.0359 0.010742
## 5 0.00021704 51 0.97539 1.0546 0.010821
## 6 0.00020000 91 0.95950 1.0779 0.010919
# First ten variable-importance entries of the cp = 0.0002 pruned tree.
head(lcDT1p2$variable.importance, 10)
## sub_grade int_rate grade bc_open_to_buy
## 900.52500 745.14474 738.80228 251.47372
## total_bc_limit total_rev_hi_lim emp_length installment
## 224.83859 199.36185 77.87732 75.24516
## loan_amnt funded_amnt
## 69.51392 68.38376
# Third candidate: prune the full tree at cp = 0.0003.
lcDT1p3 <- prune.rpart(lcDT1, cp = 0.0003)
# Complexity-parameter table of the pruned tree.
printcp(lcDT1p3)
##
## Classification tree:
## rpart(formula = loan_status ~ ., data = lcdfTrn %>% select(-all_of(varsOmit)),
## method = "class", parms = list(split = "information"), control = rpart.control(cp = 0.0001,
## minsplit = 50))
##
## Variables actually used in tree construction:
## [1] acc_open_past_24mths annual_inc
## [3] bc_util delinq_2yrs
## [5] dti emp_length
## [7] installment loan_amnt
## [9] mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op
## [11] mths_since_last_delinq mths_since_recent_bc
## [13] num_actv_rev_tl num_il_tl
## [15] num_op_rev_tl pct_tl_nvr_dlq
## [17] propSatisBankcardAccts purpose
## [19] revol_util sub_grade
## [21] tot_coll_amt tot_cur_bal
## [23] total_bal_ex_mort total_bc_limit
## [25] total_il_high_credit_limit
##
## Root node error: 7679/54963 = 0.13971
##
## n= 54963
##
## CP nsplit rel error xerror xstd
## 1 0.00035347 0 1.00000 1.0000 0.010584
## 2 0.00032556 39 0.97877 1.0203 0.010674
## 3 0.00030386 41 0.97812 1.0223 0.010682
## 4 0.00030000 44 0.97721 1.0359 0.010742
# First ten variable-importance entries of the cp = 0.0003 pruned tree.
head(lcDT1p3$variable.importance, 10)
## sub_grade int_rate grade bc_open_to_buy
## 881.69367 738.71905 735.75677 241.25023
## total_bc_limit total_rev_hi_lim installment funded_amnt
## 212.95990 188.25359 57.12921 50.94669
## loan_amnt emp_length
## 50.94669 41.38985
Using the ‘prior’ parameter to account for unbalanced training data. The ‘prior’ parameter can be used to specify the distribution of examples across classes. By default, the prior is taken from the dataset.
# Train a tree as if the training data were balanced: the class priors are set
# to 0.5 / 0.5 instead of being taken from the data. Gini splitting is used and
# the tree is grown with no cp penalty (cp = 0), bounded only by minsplit,
# minbucket and maxdepth, with xval = 10 cross-validations.
lcDT1b <- rpart(
  formula = loan_status ~ .,
  data = lcdfTrn %>% select(-all_of(varsOmit)),
  method = "class",
  parms = list(split = "gini", prior = c(0.5, 0.5)),
  control = rpart.control(
    cp = 0.0, minsplit = 20, minbucket = 10, maxdepth = 20, xval = 10
  )
)
# Complexity-parameter table of the equal-prior tree.
printcp(lcDT1b)
##
## Classification tree:
## rpart(formula = loan_status ~ ., data = lcdfTrn %>% select(-all_of(varsOmit)),
## method = "class", parms = list(split = "gini", prior = c(0.5,
## 0.5)), control = rpart.control(cp = 0, minsplit = 20,
## minbucket = 10, maxdepth = 20, xval = 10))
##
## Variables actually used in tree construction:
## [1] acc_open_past_24mths annual_inc
## [3] avg_cur_bal bc_open_to_buy
## [5] bc_util borrHistory
## [7] collections_12_mths_ex_med delinq_2yrs
## [9] dti emp_length
## [11] funded_amnt home_ownership
## [13] initial_list_status inq_last_6mths
## [15] installment int_rate
## [17] loan_amnt mo_sin_old_il_acct
## [19] mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op
## [21] mo_sin_rcnt_tl mort_acc
## [23] mths_since_last_delinq mths_since_recent_bc
## [25] mths_since_recent_inq num_accts_ever_120_pd
## [27] num_actv_bc_tl num_actv_rev_tl
## [29] num_bc_sats num_bc_tl
## [31] num_il_tl num_op_rev_tl
## [33] num_rev_accts num_rev_tl_bal_gt_0
## [35] num_sats num_tl_90g_dpd_24m
## [37] num_tl_op_past_12m open_acc
## [39] openAccRatio pct_tl_nvr_dlq
## [41] percent_bc_gt_75 propSatisBankcardAccts
## [43] pub_rec pub_rec_bankruptcies
## [45] purpose revol_bal
## [47] revol_util sub_grade
## [49] tax_liens tot_coll_amt
## [51] tot_cur_bal tot_hi_cred_lim
## [53] total_acc total_bal_ex_mort
## [55] total_bc_limit total_il_high_credit_limit
## [57] total_rev_hi_lim verification_status
##
## Root node error: 27481/54963 = 0.5
##
## n= 54963
##
## CP nsplit rel error xerror xstd
## 1 0.235386460815600301321737 0 1.00000 1.01220 0.0083281
## 2 0.003700364846366237334824 1 0.76461 0.76487 0.0062163
## 3 0.003359844974068917390631 4 0.75351 0.76295 0.0064583
## 4 0.002561453563205112315265 5 0.75015 0.75930 0.0066369
## 5 0.002033994870681306906673 6 0.74759 0.75670 0.0068260
## 6 0.001775474921584736451297 7 0.74556 0.75754 0.0067765
## 7 0.001507205426643486992008 8 0.74378 0.75742 0.0067661
## 8 0.001399342124882560706514 9 0.74227 0.75746 0.0067450
## 9 0.001301098925843511113290 11 0.73948 0.75712 0.0066647
## 10 0.001221504257832313001045 12 0.73817 0.75907 0.0066573
## 11 0.001061020490581943750280 16 0.73300 0.76274 0.0066928
## 12 0.001038340403002599833873 28 0.71552 0.76448 0.0066397
## 13 0.000951793216340905624991 29 0.71448 0.76613 0.0066174
## 14 0.000907220027827737975426 35 0.70871 0.76447 0.0066226
## 15 0.000888249725065561436879 39 0.70467 0.76547 0.0066584
## 16 0.000875879644511507512300 40 0.70378 0.76626 0.0066878
## 17 0.000812562678701050943898 42 0.70203 0.76704 0.0066864
## 18 0.000782505710176804153942 44 0.70040 0.76637 0.0067143
## 19 0.000782376267054007588572 45 0.69962 0.76690 0.0067299
## 20 0.000772508294522521155555 46 0.69884 0.76737 0.0067381
## 21 0.000766867879299389337659 47 0.69806 0.76722 0.0067360
## 22 0.000758452240244956153205 48 0.69730 0.76705 0.0067551
## 23 0.000753025727487152489979 51 0.69502 0.76736 0.0067665
## 24 0.000733543160451779704435 53 0.69352 0.76755 0.0067790
## 25 0.000697910498265797915596 54 0.69278 0.76689 0.0067583
## 26 0.000679711346022409887424 55 0.69208 0.76724 0.0067885
## 27 0.000670479572668923216525 57 0.69072 0.76665 0.0067909
## 28 0.000659969892741445489023 59 0.68938 0.76665 0.0067826
## 29 0.000655612892310295826415 61 0.68806 0.76555 0.0067695
## 30 0.000642075895774777034852 62 0.68741 0.76576 0.0067778
## 31 0.000641898347548189784559 68 0.68341 0.76707 0.0067959
## 32 0.000636642589548120372070 71 0.68148 0.76716 0.0067990
## 33 0.000625491202224650265883 72 0.68085 0.76819 0.0068204
## 34 0.000624466673678261783437 74 0.67959 0.76816 0.0068204
## 35 0.000613315286354792436191 75 0.67897 0.76727 0.0068125
## 36 0.000613315286354791677249 76 0.67836 0.76790 0.0068310
## 37 0.000602805606427315467631 77 0.67774 0.76826 0.0068381
## 38 0.000601522191635333854445 79 0.67654 0.76781 0.0068404
## 39 0.000594409705154014017699 82 0.67466 0.76818 0.0068391
## 40 0.000592166483377041716861 84 0.67347 0.76811 0.0068298
## 41 0.000592166483377040632659 85 0.67288 0.76811 0.0068298
## 42 0.000590947790146457031107 86 0.67228 0.76803 0.0068309
## 43 0.000588704568369482887126 88 0.67110 0.76803 0.0068309
## 44 0.000586078525442111880443 89 0.67051 0.76873 0.0068451
## 45 0.000572042208945679046296 91 0.66934 0.77043 0.0068525
## 46 0.000571017680399287528084 92 0.66877 0.77010 0.0068496
## 47 0.000568839180183713997822 93 0.66820 0.76994 0.0068518
## 48 0.000549868877421538001377 94 0.66763 0.77227 0.0068899
## 49 0.000548671758044752869675 95 0.66708 0.77466 0.0069359
## 50 0.000546997994204451317389 98 0.66543 0.77488 0.0069368
## 51 0.000538652768536671184597 106 0.66021 0.77523 0.0069407
## 52 0.000528720074443787498887 108 0.65913 0.77496 0.0069552
## 53 0.000528720074443786414685 109 0.65860 0.77454 0.0069564
## 54 0.000519747187335892982946 111 0.65754 0.77422 0.0069566
## 55 0.000516159501350866179586 112 0.65702 0.77548 0.0069733
## 56 0.000512399224535445003796 115 0.65548 0.77499 0.0069735
## 57 0.000507571271466035370094 120 0.65251 0.77518 0.0069653
## 58 0.000507571271466034936413 121 0.65200 0.77560 0.0069712
## 59 0.000503084827912088274754 122 0.65149 0.77585 0.0069619
## 60 0.000503084827912087624233 123 0.65099 0.77562 0.0069630
## 61 0.000495842898307973282100 124 0.65049 0.77572 0.0069630
## 62 0.000491635078780755280410 128 0.64828 0.77614 0.0069668
## 63 0.000486422468488283837613 132 0.64631 0.77676 0.0069757
## 64 0.000486422468488283349722 133 0.64582 0.77734 0.0069703
## 65 0.000484243968272708789468 134 0.64533 0.77738 0.0069713
## 66 0.000483602260876717386564 135 0.64485 0.77738 0.0069713
## 67 0.000483089996603522711660 137 0.64388 0.77738 0.0069713
## 68 0.000478026567214983038202 139 0.64292 0.77867 0.0069900
## 69 0.000472515595114646647264 141 0.64196 0.77811 0.0069791
## 70 0.000465273665510533335123 143 0.64102 0.77909 0.0070192
## 71 0.000461941193625771667070 144 0.64055 0.77866 0.0070244
## 72 0.000451778924875890889896 145 0.64009 0.77890 0.0070334
## 73 0.000448952815602190460562 162 0.63100 0.78222 0.0070498
## 74 0.000446079591392457570475 166 0.62910 0.78205 0.0070459
## 75 0.000440792390648018508285 170 0.62731 0.78199 0.0070449
## 76 0.000430217989159144015981 171 0.62687 0.78165 0.0070330
## 77 0.000423488323828224457173 173 0.62601 0.78278 0.0070465
## 78 0.000422976059555029402798 175 0.62516 0.78273 0.0070466
## 79 0.000422976059555029294378 177 0.62432 0.78268 0.0070486
## 80 0.000422976059555029185958 178 0.62389 0.78268 0.0070486
## 81 0.000420412902116392599768 180 0.62305 0.78319 0.0070574
## 82 0.000419643587670268276846 184 0.62130 0.78298 0.0070575
## 83 0.000410286043339312981801 185 0.62088 0.78403 0.0070851
## 84 0.000409581450454587266824 190 0.61865 0.78389 0.0070821
## 85 0.000407402950239012164469 192 0.61783 0.78413 0.0070900
## 86 0.000406248978569826466132 194 0.61702 0.78447 0.0070768
## 87 0.000400096299073499214391 196 0.61620 0.78426 0.0070729
## 88 0.000394008341138570082705 201 0.61406 0.78307 0.0070595
## 89 0.000394008341138569974284 202 0.61367 0.78378 0.0070692
## 90 0.000393496076865375190960 203 0.61328 0.78378 0.0070692
## 91 0.000391445183699933117044 206 0.61210 0.78437 0.0070739
## 92 0.000391252855088402076971 209 0.61092 0.78435 0.0070729
## 93 0.000389586619146021486890 212 0.60963 0.78481 0.0070777
## 94 0.000389586619146021270049 214 0.60885 0.78484 0.0070796
## 95 0.000388432647476835788553 217 0.60738 0.78484 0.0070796
## 96 0.000387920383203640896808 219 0.60661 0.78500 0.0070836
## 97 0.000387455283046998799103 221 0.60583 0.78504 0.0070846
## 98 0.000386189425699863325084 241 0.59678 0.78504 0.0070846
## 99 0.000383433939649694451989 247 0.59390 0.78558 0.0070853
## 100 0.000380678453599526554676 249 0.59314 0.78609 0.0070870
## 101 0.000377345981714765374513 250 0.59276 0.78690 0.0070816
## 102 0.000377302834007166157989 251 0.59238 0.78765 0.0070721
## 103 0.000375615024210986664377 256 0.58974 0.78782 0.0070721
## 104 0.000370681037945243556288 258 0.58899 0.78893 0.0070885
## 105 0.000369014802002862749367 259 0.58861 0.78902 0.0071015
## 106 0.000367283844499084364491 261 0.58788 0.78869 0.0071047
## 107 0.000362317497452641762695 263 0.58714 0.78910 0.0071004
## 108 0.000360041914894969480258 267 0.58569 0.78997 0.0071269
## 109 0.000359529650621775022194 269 0.58497 0.79035 0.0071267
## 110 0.000356709443010208733776 270 0.58461 0.79061 0.0071326
## 111 0.000356197178737013842031 272 0.58390 0.79039 0.0071317
## 112 0.000351915640892345006965 273 0.58354 0.79074 0.0071285
## 113 0.000348955249132899066218 278 0.58178 0.79099 0.0071453
## 114 0.000348955249132898957798 280 0.58109 0.79179 0.0071508
## 115 0.000344840885704267318713 283 0.57992 0.79218 0.0071476
## 116 0.000338380847644023327082 288 0.57819 0.79251 0.0071435
## 117 0.000338380847644023164452 289 0.57785 0.79535 0.0071826
## 118 0.000337226875974837628745 290 0.57751 0.79535 0.0071826
## 119 0.000336714611701642845421 291 0.57718 0.79498 0.0071798
## 120 0.000335560640032457092874 293 0.57650 0.79572 0.0071864
## 121 0.000335048375759262797441 295 0.57583 0.79539 0.0071826
## 122 0.000335048375759262526390 296 0.57550 0.79568 0.0071874
## 123 0.000335048375759262255340 297 0.57516 0.79568 0.0071874
## 124 0.000333894404090076665423 298 0.57483 0.79574 0.0071844
## 125 0.000332869875543687207195 299 0.57449 0.79561 0.0071844
## 126 0.000332740432420890858665 300 0.57416 0.79561 0.0071844
## 127 0.000331715903874501617278 304 0.57267 0.79553 0.0071786
## 128 0.000331715903874501346228 305 0.57234 0.79620 0.0071831
## 129 0.000329984946370723015562 306 0.57201 0.79594 0.0071803
## 130 0.000328895696262935328859 308 0.57135 0.79622 0.0071762
## 131 0.000326075488651369311491 310 0.57069 0.79709 0.0072014
## 132 0.000324473974270386733045 313 0.56960 0.79748 0.0072042
## 133 0.000321076780824227541247 317 0.56830 0.79844 0.0072076
## 134 0.000319410544881847059586 321 0.56669 0.79927 0.0072140
## 135 0.000318727525850920464980 322 0.56637 0.80128 0.0072375
## 136 0.000317232044666272119861 325 0.56542 0.80183 0.0072362
## 137 0.000317232044666271957231 327 0.56478 0.80183 0.0072362
## 138 0.000317232044666271903021 328 0.56446 0.80183 0.0072362
## 139 0.000316078072997086042054 333 0.56288 0.80211 0.0072439
## 140 0.000314241082296974237002 334 0.56256 0.80265 0.0072406
## 141 0.000300141880311806331840 338 0.56122 0.80307 0.0072335
## 142 0.000298838727738688593114 355 0.55435 0.80485 0.0072717
## 143 0.000298326463465493972420 360 0.55269 0.80485 0.0072717
## 144 0.000296083241688520587380 362 0.55209 0.80474 0.0072757
## 145 0.000296083241688520424750 365 0.55120 0.80511 0.0072803
## 146 0.000296083241688520316329 367 0.55061 0.80511 0.0072803
## 147 0.000292750769803759678268 368 0.55031 0.80507 0.0072804
## 148 0.000292750769803759461427 369 0.55002 0.80570 0.0072966
## 149 0.000292750769803759407217 372 0.54914 0.80570 0.0072966
## 150 0.000291885291051870079254 373 0.54885 0.80574 0.0072966
## 151 0.000286085826034237317942 377 0.54768 0.80855 0.0073067
## 152 0.000285508840199644794034 378 0.54740 0.80849 0.0073018
## 153 0.000285508840199644522984 380 0.54682 0.80849 0.0073018
## 154 0.000285508840199644251933 382 0.54625 0.80849 0.0073018
## 155 0.000283631521474953288663 387 0.54439 0.80840 0.0072999
## 156 0.000282197330144462314207 396 0.54129 0.80875 0.0073036
## 157 0.000279388521484016873371 413 0.53571 0.80939 0.0073353
## 158 0.000276023688818556524761 420 0.53354 0.80926 0.0073354
## 159 0.000274934438710769271739 423 0.53269 0.81102 0.0073479
## 160 0.000274934438710769000688 424 0.53242 0.81134 0.0073468
## 161 0.000274934438710768946478 436 0.52912 0.81134 0.0073468
## 162 0.000274934438710768892268 438 0.52857 0.81134 0.0073468
## 163 0.000274934438710768729638 439 0.52829 0.81134 0.0073468
## 164 0.000273780467041583302351 445 0.52639 0.81173 0.0073514
## 165 0.000273268202768388573237 446 0.52612 0.81195 0.0073599
## 166 0.000271601966826008091576 448 0.52557 0.81237 0.0073578
## 167 0.000271601966826008037366 450 0.52503 0.81237 0.0073578
## 168 0.000271601966826007983156 451 0.52475 0.81237 0.0073578
## 169 0.000271601966826007820525 452 0.52448 0.81237 0.0073578
## 170 0.000268269494941246911413 453 0.52421 0.81261 0.0073663
## 171 0.000266314766081570222008 454 0.52394 0.81260 0.0073557
## 172 0.000265449287329680839835 458 0.52288 0.81331 0.0073543
## 173 0.000264360037221893207342 460 0.52235 0.81344 0.0073542
## 174 0.000262693801279512725681 462 0.52182 0.81375 0.0073589
## 175 0.000262116815444919930723 464 0.52129 0.81435 0.0073489
## 176 0.000261027565337132569281 468 0.52024 0.81460 0.0073516
## 177 0.000260962843775734286596 470 0.51972 0.81474 0.0073516
## 178 0.000259328968614052864962 473 0.51880 0.81466 0.0073506
## 179 0.000258239718506265069839 490 0.51205 0.81559 0.0073627
## 180 0.000255622626433129652729 494 0.51102 0.81663 0.0074053
## 181 0.000254412348535587682242 497 0.51025 0.81652 0.0073987
## 182 0.000253785635733017685047 506 0.50796 0.81666 0.0074015
## 183 0.000253785635733017576627 507 0.50771 0.81699 0.0074013
## 184 0.000253785635733017522417 511 0.50670 0.81699 0.0074013
## 185 0.000253785635733017522417 514 0.50593 0.81699 0.0074013
## 186 0.000253785635733017468207 518 0.50492 0.81699 0.0074013
## 187 0.000253785635733017197156 524 0.50338 0.81699 0.0074013
## 188 0.000253208649898424673248 525 0.50312 0.81733 0.0074078
## 189 0.000252696385625229889924 527 0.50261 0.81733 0.0074078
## 190 0.000251542413956044191587 529 0.50211 0.81738 0.0074087
## 191 0.000250453163848256559094 531 0.50161 0.81769 0.0074114
## 192 0.000250453163848256016993 533 0.50111 0.81768 0.0074153
## 193 0.000247462201478958730445 549 0.49647 0.81831 0.0074197
## 194 0.000247120691963495378932 552 0.49573 0.81887 0.0074165
## 195 0.000245966720294309843225 553 0.49548 0.81925 0.0074354
## 196 0.000243788220078734713765 554 0.49523 0.81919 0.0074297
## 197 0.000243435005600039898075 555 0.49499 0.81942 0.0074286
## 198 0.000243211234244141810386 564 0.49238 0.81942 0.0074286
## 199 0.000243211234244141674861 569 0.49111 0.81942 0.0074286
## 200 0.000242634248409548934113 571 0.49062 0.82000 0.0074378
## 201 0.000242634248409548907007 578 0.48892 0.82033 0.0074462
## 202 0.000242634248409548879902 582 0.48795 0.82033 0.0074462
## 203 0.000241544998301759268741 584 0.48747 0.82033 0.0074462
## 204 0.000240968012467168452451 586 0.48699 0.82101 0.0074544
## 205 0.000239878762359379979702 588 0.48650 0.82086 0.0074526
## 206 0.000237123276309212895540 594 0.48482 0.82118 0.0074524
## 207 0.000234815332970841227815 595 0.48458 0.82193 0.0074567
## 208 0.000232636832755266667562 599 0.48337 0.82245 0.0075011
## 209 0.000232636832755266152566 600 0.48314 0.82644 0.0075374
## 210 0.000232636832755266071250 604 0.48221 0.82644 0.0075374
## 211 0.000232636832755266044145 609 0.48104 0.82644 0.0075374
## 212 0.000232636832755266017040 612 0.48035 0.82644 0.0075374
## 213 0.000232636832755265881515 618 0.47895 0.82644 0.0075374
## 214 0.000232059846920673194977 619 0.47872 0.82676 0.0075344
## 215 0.000232059846920673167872 621 0.47825 0.82668 0.0075345
## 216 0.000231970338378313884007 623 0.47779 0.82668 0.0075345
## 217 0.000230393610978292713316 628 0.47663 0.82674 0.0075391
## 218 0.000229993232383046892521 630 0.47617 0.82661 0.0075373
## 219 0.000229304360870505243453 648 0.47106 0.82706 0.0075399
## 220 0.000229304360870505107928 649 0.47084 0.82706 0.0075399
## 221 0.000228535046424381272897 650 0.47061 0.82702 0.0075409
## 222 0.000225971888985744198816 655 0.46923 0.82890 0.0075576
## 223 0.000225971888985744144606 656 0.46901 0.82868 0.0075559
## 224 0.000222062431266390630270 657 0.46878 0.83031 0.0075605
## 225 0.000222062431266390223695 665 0.46662 0.83130 0.0075824
## 226 0.000221485445431797482946 667 0.46618 0.83130 0.0075824
## 227 0.000221485445431797455841 669 0.46573 0.83141 0.0075824
## 228 0.000221485445431797320316 670 0.46551 0.83141 0.0075824
## 229 0.000219819209489417001285 673 0.46485 0.83130 0.0075871
## 230 0.000219819209489416947075 677 0.46397 0.83095 0.0075845
## 231 0.000213186626500594324403 681 0.46309 0.83272 0.0075947
## 232 0.000211488029777514836925 703 0.45643 0.83297 0.0075983
## 233 0.000211488029777514701399 706 0.45580 0.83528 0.0076221
## 234 0.000211488029777514647189 712 0.45438 0.83528 0.0076221
## 235 0.000211488029777514592979 717 0.45333 0.83528 0.0076221
## 236 0.000209244808000541235044 734 0.44967 0.83937 0.0076753
## 237 0.000208155557892753683867 736 0.44925 0.83974 0.0076769
## 238 0.000208155557892753548341 742 0.44800 0.83960 0.0076742
## 239 0.000208155557892753412816 743 0.44779 0.83960 0.0076742
## 240 0.000207001586223567904214 746 0.44717 0.83932 0.0076855
## 241 0.000206680732525572419603 750 0.44625 0.83948 0.0076817
## 242 0.000205925280428060117994 760 0.44332 0.83967 0.0076807
## 243 0.000203669114338806913787 766 0.44208 0.83943 0.0076790
## 244 0.000202002878396426865807 767 0.44188 0.84012 0.0076887
## 245 0.000202002878396426594756 793 0.43514 0.84073 0.0076939
## 246 0.000202002878396426459231 798 0.43404 0.84073 0.0076939
## 247 0.000200913628288638935158 800 0.43364 0.84107 0.0076937
## 248 0.000200913628288638880948 802 0.43324 0.84100 0.0076910
## 249 0.000200913628288638799633 804 0.43283 0.84100 0.0076910
## 250 0.000200336642454046004675 806 0.43243 0.84100 0.0076910
## 251 0.000199503524482855845159 810 0.43163 0.84191 0.0077024
## 252 0.000199247392346258426392 818 0.42991 0.84165 0.0077145
## 253 0.000197004170569285068457 820 0.42952 0.84189 0.0077089
## 254 0.000197004170569285041352 824 0.42873 0.84478 0.0077365
## 255 0.000195167179869173453141 830 0.42755 0.84531 0.0077417
## 256 0.000192176217499875190810 840 0.42519 0.84714 0.0077497
## 257 0.000191428476907550720095 843 0.42461 0.84768 0.0077585
## 258 0.000190339226799763548388 846 0.42397 0.84798 0.0077574
## 259 0.000190339226799763141813 850 0.42294 0.84793 0.0077565
## 260 0.000190339226799763033392 854 0.42218 0.84793 0.0077565
## 261 0.000189762240965170292644 861 0.42034 0.84797 0.0077556
## 262 0.000189228402838176117898 864 0.41969 0.84932 0.0077657
## 263 0.000188459088392052309972 867 0.41912 0.84910 0.0077640
## 264 0.000187262887051599488837 871 0.41826 0.84979 0.0077883
## 265 0.000187006754915002178490 877 0.41711 0.84976 0.0077874
## 266 0.000187006754915002124280 878 0.41693 0.84976 0.0077874
## 267 0.000186429769080409464847 879 0.41674 0.84961 0.0077875
## 268 0.000185895930953414964841 883 0.41599 0.84987 0.0077827
## 269 0.000183674283030241215168 890 0.41408 0.85040 0.0077833
## 270 0.000183641922249542832767 891 0.41389 0.85091 0.0078012
## 271 0.000180854075418675306220 930 0.40355 0.85165 0.0077989
## 272 0.000180298663437881496107 938 0.40182 0.85206 0.0078060
## 273 0.000179764825310887375572 941 0.40128 0.85250 0.0078111
## 274 0.000179187839476294553508 958 0.39776 0.85286 0.0078127
## 275 0.000178931707339697134741 960 0.39740 0.85286 0.0078118
## 276 0.000178098589368506948121 964 0.39668 0.85309 0.0078162
## 277 0.000178098589368506921016 966 0.39633 0.85315 0.0078162
## 278 0.000177594784144370442360 969 0.39574 0.85317 0.0078153
## 279 0.000175855367591533617291 981 0.39333 0.85351 0.0078160
## 280 0.000175855367591533563081 983 0.39298 0.85468 0.0078343
## 281 0.000174766117483746011903 985 0.39262 0.85480 0.0078269
## 282 0.000174317473128351253580 988 0.39205 0.85484 0.0078260
## 283 0.000170642757299061659498 997 0.39041 0.85794 0.0078548
## 284 0.000169767409656603482718 1006 0.38849 0.85890 0.0078678
## 285 0.000169190423822011717751 1040 0.37960 0.85892 0.0078687
## 286 0.000167524187879631263195 1055 0.37664 0.85929 0.0078720
## 287 0.000166947202045038278501 1057 0.37630 0.85967 0.0078763
## 288 0.000165857951937250808639 1060 0.37575 0.85985 0.0078780
## 289 0.000165857951937250754429 1062 0.37542 0.85989 0.0078780
## 290 0.000165857951937250673114 1064 0.37509 0.85989 0.0078780
## 291 0.000165280966102657878155 1065 0.37493 0.85989 0.0078780
## 292 0.000165280966102657851050 1067 0.37459 0.86014 0.0078760
## 293 0.000162525480052489899527 1077 0.37294 0.85964 0.0078916
## 294 0.000162525480052489764001 1078 0.37278 0.86025 0.0078948
## 295 0.000160645341644778932161 1080 0.37245 0.86095 0.0078953
## 296 0.000158616022333136005721 1083 0.37197 0.86149 0.0079201
## 297 0.000158616022333135951511 1085 0.37166 0.86149 0.0079210
## 298 0.000158616022333135680460 1093 0.37039 0.86149 0.0079210
## 299 0.000158039036498543075237 1095 0.37007 0.86167 0.0079236
## 300 0.000158039036498543021027 1099 0.36944 0.86180 0.0079235
## 301 0.000156949786390755496954 1103 0.36869 0.86188 0.0079234
## 302 0.000156949786390755361429 1105 0.36837 0.86278 0.0079318
## 303 0.000155432731352307272523 1108 0.36785 0.86324 0.0079351
## 304 0.000154706564613782111915 1114 0.36666 0.86296 0.0079353
## 305 0.000154129578779189262746 1118 0.36604 0.86300 0.0079343
## 306 0.000153873446642591816874 1121 0.36544 0.86311 0.0079352
## 307 0.000153595740652195142210 1125 0.36482 0.86300 0.0079343
## 308 0.000152869573913670117127 1128 0.36436 0.86300 0.0079343
## 309 0.000151758749952083201633 1131 0.36391 0.86391 0.0079570
## 310 0.000148767787582785291668 1135 0.36327 0.86464 0.0079538
## 311 0.000148041620844260429215 1139 0.36266 0.86508 0.0079580
## 312 0.000148041620844260158165 1141 0.36236 0.86520 0.0079562
## 313 0.000148041620844260049744 1143 0.36207 0.86520 0.0079562
## 314 0.000147464635009667363206 1146 0.36162 0.86520 0.0079562
## 315 0.000145798399067286881545 1149 0.36108 0.86525 0.0079579
## 316 0.000144709148959499303263 1155 0.36005 0.86565 0.0079559
## 317 0.000144709148959499249052 1159 0.35947 0.86595 0.0079583
## 318 0.000144709148959498978002 1169 0.35802 0.86595 0.0079583
## 319 0.000144132163124906399884 1170 0.35788 0.86595 0.0079583
## 320 0.000141376677074738475465 1174 0.35730 0.86615 0.0079591
## 321 0.000141376677074738339940 1175 0.35716 0.86634 0.0079590
## 322 0.000141376677074738068890 1177 0.35688 0.86634 0.0079590
## 323 0.000140607362628614477804 1179 0.35660 0.86701 0.0079692
## 324 0.000140222705405552560288 1182 0.35617 0.86760 0.0079858
## 325 0.000138977848139509667889 1183 0.35603 0.86746 0.0079912
## 326 0.000138770371928502374596 1192 0.35462 0.86752 0.0079912
## 327 0.000137467219355384635869 1195 0.35420 0.86753 0.0079929
## 328 0.000137467219355384500344 1197 0.35393 0.86754 0.0079912
## 329 0.000137467219355384473239 1201 0.35338 0.86754 0.0079912
## 330 0.000137467219355384446134 1203 0.35310 0.86754 0.0079912
## 331 0.000137467219355384364819 1205 0.35283 0.86754 0.0079912
## 332 0.000136890233520791651176 1207 0.35255 0.86814 0.0080218
## 333 0.000136890233520791596965 1213 0.35173 0.86814 0.0080218
## 334 0.000135394752336142574220 1214 0.35160 0.86856 0.0080189
## 335 0.000135223997578411142409 1221 0.35020 0.86869 0.0080197
## 336 0.000133942418859092524054 1223 0.34993 0.86972 0.0080084
## 337 0.000133557761636030796273 1226 0.34953 0.86972 0.0080084
## 338 0.000131346900639756715206 1228 0.34926 0.87103 0.0080279
## 339 0.000126892817866508788313 1255 0.34359 0.87253 0.0080560
## 340 0.000126892817866508761208 1265 0.34232 0.87305 0.0080513
## 341 0.000126892817866508734103 1271 0.34156 0.87305 0.0080513
## 342 0.000125781993904921791504 1273 0.34130 0.87318 0.0080530
## 343 0.000124671169943334821800 1276 0.34093 0.87336 0.0080555
## 344 0.000124671169943334740485 1279 0.34055 0.87363 0.0080544
## 345 0.000123560345981748204462 1282 0.34018 0.87363 0.0080544
## 346 0.000123560345981747879201 1286 0.33968 0.87459 0.0080626
## 347 0.000123560345981747824991 1287 0.33956 0.87459 0.0080626
## 348 0.000123560345981747689466 1297 0.33832 0.87459 0.0080626
## 349 0.000122983360147154948717 1298 0.33820 0.87472 0.0080634
## 350 0.000122983360147154894507 1300 0.33795 0.87512 0.0080623
## 351 0.000121317124204774467056 1302 0.33771 0.87599 0.0080678
## 352 0.000121252402643376265686 1316 0.33601 0.87606 0.0080696
## 353 0.000120227874096987037852 1317 0.33589 0.87606 0.0080687
## 354 0.000119843216873924930600 1318 0.33577 0.87621 0.0080703
## 355 0.000118732392912337947344 1321 0.33541 0.87626 0.0080703
## 356 0.000117236911727689060124 1324 0.33505 0.87629 0.0080694
## 357 0.000116863041431526824766 1327 0.33470 0.87654 0.0080780
## 358 0.000116318416377633076283 1331 0.33423 0.87654 0.0080780
## 359 0.000116318416377633035625 1339 0.33322 0.87702 0.0080804
## 360 0.000116318416377633022073 1345 0.33252 0.87702 0.0080804
## 361 0.000116318416377633008520 1347 0.33229 0.87702 0.0080804
## 362 0.000115741430543040186457 1358 0.33096 0.87711 0.0080821
## 363 0.000113562930327464948576 1359 0.33084 0.87734 0.0080889
## 364 0.000112408958658279250239 1360 0.33073 0.87987 0.0081284
## 365 0.000111682791919754089631 1365 0.33005 0.88040 0.0081333
## 366 0.000111298134696692185668 1370 0.32933 0.88027 0.0081334
## 367 0.000110845175570537641217 1373 0.32900 0.88065 0.0081331
## 368 0.000110742722715898741473 1380 0.32781 0.88048 0.0081315
## 369 0.000110571967958167119927 1382 0.32759 0.88041 0.0081315
## 370 0.000105744014888757350700 1386 0.32714 0.88121 0.0081415
## 371 0.000105744014888757296490 1390 0.32671 0.88165 0.0081447
## 372 0.000104077778946376841933 1408 0.32443 0.88208 0.0081409
## 373 0.000103522366965583302871 1410 0.32422 0.88217 0.0081417
## 374 0.000102411543003996387377 1416 0.32360 0.88275 0.0081448
## 375 0.000102411543003996319615 1427 0.32248 0.88275 0.0081448
## 376 0.000102411543003996251852 1430 0.32217 0.88275 0.0081448
## 377 0.000102411543003996129879 1432 0.32196 0.88275 0.0081448
## 378 0.000099719676871628338882 1434 0.32176 0.88180 0.0080631
## 379 0.000099079071119235464712 1440 0.32099 0.88197 0.0080630
## 380 0.000099079071119235410502 1443 0.32069 0.88194 0.0080639
## 381 0.000095169613399881516696 1444 0.32059 0.88218 0.0080681
## 382 0.000094592627565288721738 1448 0.32021 0.88230 0.0080698
## 383 0.000092823938768269245465 1453 0.31974 0.88251 0.0080714
## 384 0.000091644812903589662379 1462 0.31849 0.88303 0.0080737
## 385 0.000091260155680527744863 1466 0.31811 0.88259 0.0080749
## 386 0.000090533988942002611359 1529 0.31236 0.88270 0.0080757
## 387 0.000089593919738147222544 1535 0.31181 0.88411 0.0081159
## 388 0.000083484387949418848514 1549 0.31032 0.88434 0.0081210
## 389 0.000081262740026244949763 1552 0.31007 0.88521 0.0081274
## 390 0.000081262740026244922658 1556 0.30975 0.88583 0.0081279
## 391 0.000081262740026244882001 1561 0.30934 0.88583 0.0081279
## 392 0.000081262740026244814238 1579 0.30787 0.88583 0.0081279
## 393 0.000080878082803182978038 1583 0.30753 0.88583 0.0081279
## 394 0.000079019518249271510513 1586 0.30729 0.88631 0.0081328
## 395 0.000077930268141484040651 1590 0.30698 0.88629 0.0081337
## 396 0.000077930268141483972888 1592 0.30682 0.88636 0.0081345
## 397 0.000077930268141483905126 1593 0.30674 0.88636 0.0081345
## 398 0.000077545610918422028268 1596 0.30651 0.88678 0.0081290
## 399 0.000076776296472298206789 1599 0.30628 0.88678 0.0081290
## 400 0.000074597796256722996014 1600 0.30620 0.88671 0.0081273
## 401 0.000072354574479749746499 1601 0.30612 0.88710 0.0081288
## 402 0.000072354574479749651631 1608 0.30550 0.88702 0.0081288
## 403 0.000070688338537369169970 1610 0.30535 0.88702 0.0081288
## 404 0.000070496009925838238317 1612 0.30521 0.88702 0.0081288
## 405 0.000070111352702776307249 1615 0.30500 0.88702 0.0081288
## 406 0.000070111352702776280144 1616 0.30493 0.88702 0.0081288
## 407 0.000068733609677692250172 1618 0.30479 0.88702 0.0081288
## 408 0.000067932852487201042264 1622 0.30452 0.88790 0.0081378
## 409 0.000067900491706501995789 1623 0.30445 0.88805 0.0081386
## 410 0.000067384784797062748682 1627 0.30418 0.88805 0.0081386
## 411 0.000067355866652607867835 1639 0.30329 0.88814 0.0081385
## 412 0.000065668056856428374223 1641 0.30316 0.88816 0.0081725
## 413 0.000061780172990873844733 1648 0.30263 0.88864 0.0081756
## 414 0.000060113937048493451163 1654 0.30225 0.88855 0.0081852
## 415 0.000060113937048493417282 1676 0.30093 0.88864 0.0081869
## 416 0.000060113937048493390177 1684 0.30045 0.88864 0.0081869
## 417 0.000057870715271520093228 1694 0.29985 0.88871 0.0081877
## 418 0.000056781465163732474288 1696 0.29973 0.88897 0.0081849
## 419 0.000056492972246436049704 1700 0.29951 0.88953 0.0081915
## 420 0.000052295021609785792300 1704 0.29928 0.89009 0.0081894
## 421 0.000051205771501998193689 1706 0.29918 0.89012 0.0081902
## 422 0.000048962549725024815425 1712 0.29887 0.89204 0.0081828
## 423 0.000045630077840263872431 1715 0.29872 0.89191 0.0081838
## 424 0.000045630077840263838550 1743 0.29744 0.89229 0.0081827
## 425 0.000040631370013122474882 1747 0.29726 0.89293 0.0081900
## 426 0.000040631370013122441000 1749 0.29718 0.89291 0.0081900
## 427 0.000038965134070741952563 1753 0.29702 0.89300 0.0081900
## 428 0.000035632662185981043451 1757 0.29680 0.89292 0.0081978
## 429 0.000035632662185980975688 1761 0.29665 0.89286 0.0081979
## 430 0.000035632662185980785953 1764 0.29655 0.89286 0.0081979
## 431 0.000035055676351388140072 1765 0.29651 0.89319 0.0082011
## 432 0.000030420051893509251676 1767 0.29644 0.89283 0.0082005
## 433 0.000030056968524246759463 1773 0.29626 0.89218 0.0081931
## 434 0.000030056968524246695088 1775 0.29620 0.89232 0.0081957
## 435 0.000027813746747273384587 1781 0.29602 0.89225 0.0081948
## 436 0.000027813746747273350706 1782 0.29599 0.89219 0.0081957
## 437 0.000025976756047161386412 1784 0.29593 0.89247 0.0081964
## 438 0.000025058260697105263657 1787 0.29586 0.89247 0.0081964
## 439 0.000020037979016164461134 1789 0.29581 0.89245 0.0081973
## 440 0.000015028484262123379731 1795 0.29569 0.89225 0.0081983
## 441 0.000014483859208229739668 1799 0.29563 0.89256 0.0082085
## 442 0.000014483859208229610919 1800 0.29561 0.89256 0.0082085
## 443 0.000014483859208229546544 1805 0.29554 0.89256 0.0082085
## 444 0.000014483859208229482170 1809 0.29548 0.89256 0.0082085
## 445 0.000006664943769521919021 1810 0.29547 0.89283 0.0082118
## 446 0.000004486443553946748904 1811 0.29546 0.89281 0.0082074
## 447 0.000003332471884760959511 1812 0.29546 0.89275 0.0082075
## 448 0.000000000000000000258553 1814 0.29545 0.89274 0.0082066
## 449 0.000000000000000000129277 1815 0.29545 0.89274 0.0082066
## 450 0.000000000000000000064638 1816 0.29545 0.89274 0.0082066
## 451 0.000000000000000000032319 1823 0.29545 0.89274 0.0082066
## 452 0.000000000000000000000000 1828 0.29545 0.89274 0.0082066
# Ten highest-ranked predictors of the unpruned balanced tree
head(lcDT1b$variable.importance, n = 10)
## sub_grade int_rate grade emp_length
## 2565.000 2349.448 1842.277 1540.025
## bc_open_to_buy total_rev_hi_lim total_bc_limit tot_cur_bal
## 1463.557 1432.300 1379.775 1245.864
## revol_bal avg_cur_bal
## 1103.546 1084.378
# Cut the balanced tree back at cp = 0.001301 and show the resulting cp table
lcDT1bp <- prune.rpart(lcDT1b, cp = 0.001301)
printcp(lcDT1bp)
##
## Classification tree:
## rpart(formula = loan_status ~ ., data = lcdfTrn %>% select(-all_of(varsOmit)),
## method = "class", parms = list(split = "gini", prior = c(0.5,
## 0.5)), control = rpart.control(cp = 0, minsplit = 20,
## minbucket = 10, maxdepth = 20, xval = 10))
##
## Variables actually used in tree construction:
## [1] acc_open_past_24mths avg_cur_bal emp_length
## [4] int_rate mo_sin_rcnt_tl mths_since_last_delinq
## [7] mths_since_recent_inq num_bc_tl sub_grade
##
## Root node error: 27481/54963 = 0.5
##
## n= 54963
##
## CP nsplit rel error xerror xstd
## 1 0.2353865 0 1.00000 1.01220 0.0083281
## 2 0.0037004 1 0.76461 0.76487 0.0062163
## 3 0.0033598 4 0.75351 0.76295 0.0064583
## 4 0.0025615 5 0.75015 0.75930 0.0066369
## 5 0.0020340 6 0.74759 0.75670 0.0068260
## 6 0.0017755 7 0.74556 0.75754 0.0067765
## 7 0.0015072 8 0.74378 0.75742 0.0067661
## 8 0.0013993 9 0.74227 0.75746 0.0067450
## 9 0.0013011 11 0.73948 0.75712 0.0066647
## 10 0.0013010 12 0.73817 0.75907 0.0066573
# Ten highest-ranked predictors after pruning
head(lcDT1bp$variable.importance, n = 10)
## sub_grade int_rate grade
## 1780.72699 1512.51791 1506.56408
## bc_open_to_buy total_bc_limit total_rev_hi_lim
## 480.07945 413.25344 371.68269
## acc_open_past_24mths avg_cur_bal mo_sin_rcnt_tl
## 94.11861 68.38363 40.82906
## num_tl_op_past_12m
## 40.49923
# Draw the pruned tree
plot(lcDT1bp)
We started with a dataset that was split 50:50 into training and test sets. We built a model and changed the cost parameter, then pruned the tree with different values of cp. We also built a balanced model and later pruned that tree as well.
# Score a classification model against a labelled data set.
#   models: fitted model object that supports predict(..., type = "class")
#   data:   data frame passed to predict(); its loan_status column is the truth
# Prints the share of matching predictions, then returns the
# predicted-vs-true contingency table.
confusionM <- function(models, data) {
  actual <- data$loan_status
  predicted_class <- predict(models, data, type = "class")
  confusion <- table(predicted = predicted_class, true = actual)
  print(mean(predicted_class == actual))
  confusion
}
# Fully grown tree (lcDT1)
# Training data
confusionM(models = lcDT1, data = lcdfTrn)
## [1] 0.8724597
## true
## predicted Charged Off Fully Paid
## Charged Off 1555 886
## Fully Paid 6124 46398
# Test data
confusionM(models = lcDT1, data = lcdfTst)
## [1] 0.8396922
## true
## predicted Charged Off Fully Paid
## Charged Off 625 1739
## Fully Paid 7072 45527
# Pruned trees (lcDT1p2 and lcDT1p3), training data
confusionM(models = lcDT1p2, data = lcdfTrn)
## [1] 0.8659462
## true
## predicted Charged Off Fully Paid
## Charged Off 629 318
## Fully Paid 7050 46966
confusionM(models = lcDT1p3, data = lcdfTrn)
## [1] 0.8634718
## true
## predicted Charged Off Fully Paid
## Charged Off 357 182
## Fully Paid 7322 47102
# Balanced tree (lcDT1b), training data
confusionM(models = lcDT1b, data = lcdfTrn)
## [1] 0.7922784
## true
## predicted Charged Off Fully Paid
## Charged Off 7184 10922
## Fully Paid 495 36362
# Balanced and pruned tree (lcDT1bp), training data
confusionM(models = lcDT1bp, data = lcdfTrn)
## [1] 0.622728
## true
## predicted Charged Off Fully Paid
## Charged Off 4932 17989
## Fully Paid 2747 29295
So far, every loan with a predicted probability above 0.5 was classified as Charged Off. We will now change the threshold to a lower value. This change is driven by our goal of detecting Charged Off loans well. The threshold should be chosen according to the goal you want to achieve; trying out multiple thresholds is also an option.
# 1. Fully grown tree (lcDT1) scored with the lowered threshold on train and test data
# Probability above which a loan is labelled 'Charged Off'
CTHRESH <- 0.3
# Training data: class probabilities -> thresholded labels -> confusion table
predProbTrn <- predict(lcDT1, lcdfTrn, type = "prob")
predTrnCT <- ifelse(
  predProbTrn[, "Charged Off"] > CTHRESH,
  "Charged Off",
  "Fully Paid"
)
table(predTrnCT, true = lcdfTrn$loan_status)
## true
## predTrnCT Charged Off Fully Paid
## Charged Off 1896 1465
## Fully Paid 5783 45819
# Test data: same steps
predProbTst <- predict(lcDT1, lcdfTst, type = "prob")
predTstCT <- ifelse(
  predProbTst[, "Charged Off"] > CTHRESH,
  "Charged Off",
  "Fully Paid"
)
table(predTstCT, true = lcdfTst$loan_status)
## true
## predTstCT Charged Off Fully Paid
## Charged Off 872 2449
## Fully Paid 6825 44817
# ROC / AUC on the test data, scored by the 'Charged Off' probability
score <- predict(lcDT1, lcdfTst, type = "prob")[, "Charged Off"]
# label.ordering gives the negative class first and the positive class second,
# so scores closer to one point towards 'Charged Off'
pred <- prediction(
  score,
  lcdfTst$loan_status,
  label.ordering = c("Fully Paid", "Charged Off")
)
# ROC curve, with the line y = x drawn for reference
aucPerf <- performance(pred, "tpr", "fpr")
plot(aucPerf)
abline(a = 0, b = 1)
# AUC value (aucPerf is reused for the AUC performance object)
aucPerf <- performance(pred, "auc")
aucPerf@y.values
## [[1]]
## [1] 0.6301163
# NOTE(review): an older pasted value, 0.6400753, was recorded here; it does
# not match the output shown above
# Lift curve
liftPerf <- performance(pred, "lift", "rpp")
plot(liftPerf)
# 2. Pruned tree lcDT1p2 scored with the same threshold (CTHRESH)
# Training data: class probabilities -> thresholded labels -> confusion table
predProbTrn <- predict(lcDT1p2, lcdfTrn, type = "prob")
predTrnCT <- ifelse(
  predProbTrn[, "Charged Off"] > CTHRESH,
  "Charged Off",
  "Fully Paid"
)
table(predTrnCT, true = lcdfTrn$loan_status)
## true
## predTrnCT Charged Off Fully Paid
## Charged Off 796 643
## Fully Paid 6883 46641
# Test data: same steps
predProbTst <- predict(lcDT1p2, lcdfTst, type = "prob")
predTstCT <- ifelse(
  predProbTst[, "Charged Off"] > CTHRESH,
  "Charged Off",
  "Fully Paid"
)
table(predTstCT, true = lcdfTst$loan_status)
## true
## predTstCT Charged Off Fully Paid
## Charged Off 426 1006
## Fully Paid 7271 46260
# ROC / AUC on the test data, scored by the 'Charged Off' probability
score <- predict(lcDT1p2, lcdfTst, type = "prob")[, "Charged Off"]
# label.ordering gives the negative class first and the positive class second,
# so scores closer to one point towards 'Charged Off'
pred <- prediction(
  score,
  lcdfTst$loan_status,
  label.ordering = c("Fully Paid", "Charged Off")
)
# ROC curve, with the line y = x drawn for reference
aucPerf <- performance(pred, "tpr", "fpr")
plot(aucPerf)
abline(a = 0, b = 1)
# AUC value (aucPerf is reused for the AUC performance object)
aucPerf <- performance(pred, "auc")
aucPerf@y.values
## [[1]]
## [1] 0.6282773
# NOTE(review): an older pasted value, 0.6400753, was recorded here; it does
# not match the output shown above
# Lift curve
liftPerf <- performance(pred, "lift", "rpp")
plot(liftPerf)
# 3. Pruned tree lcDT1p3 scored with the same threshold (CTHRESH)
# Training data: class probabilities -> thresholded labels -> confusion table
predProbTrn <- predict(lcDT1p3, lcdfTrn, type = "prob")
predTrnCT <- ifelse(
  predProbTrn[, "Charged Off"] > CTHRESH,
  "Charged Off",
  "Fully Paid"
)
table(predTrnCT, true = lcdfTrn$loan_status)
## true
## predTrnCT Charged Off Fully Paid
## Charged Off 407 271
## Fully Paid 7272 47013
# Test data: same steps
predProbTst <- predict(lcDT1p3, lcdfTst, type = "prob")
predTstCT <- ifelse(
  predProbTst[, "Charged Off"] > CTHRESH,
  "Charged Off",
  "Fully Paid"
)
table(predTstCT, true = lcdfTst$loan_status)
## true
## predTstCT Charged Off Fully Paid
## Charged Off 211 455
## Fully Paid 7486 46811
# ROC / AUC on the test data, scored by the 'Charged Off' probability
score <- predict(lcDT1p3, lcdfTst, type = "prob")[, "Charged Off"]
# label.ordering gives the negative class first and the positive class second,
# so scores closer to one point towards 'Charged Off'
pred <- prediction(
  score,
  lcdfTst$loan_status,
  label.ordering = c("Fully Paid", "Charged Off")
)
# ROC curve, with the line y = x drawn for reference
aucPerf <- performance(pred, "tpr", "fpr")
plot(aucPerf)
abline(a = 0, b = 1)
# AUC value (aucPerf is reused for the AUC performance object)
aucPerf <- performance(pred, "auc")
aucPerf@y.values
## [[1]]
## [1] 0.6344883
# Lift curve
liftPerf <- performance(pred, "lift", "rpp")
plot(liftPerf)
# 4. Balanced tree lcDT1b scored with the same threshold (CTHRESH)
# Training data: class probabilities -> thresholded labels -> confusion table
predProbTrn <- predict(lcDT1b, lcdfTrn, type = "prob")
predTrnCT <- ifelse(
  predProbTrn[, "Charged Off"] > CTHRESH,
  "Charged Off",
  "Fully Paid"
)
table(predTrnCT, true = lcdfTrn$loan_status)
## true
## predTrnCT Charged Off Fully Paid
## Charged Off 7477 13629
## Fully Paid 202 33655
# Test data: same steps
predProbTst <- predict(lcDT1b, lcdfTst, type = "prob")
predTstCT <- ifelse(
  predProbTst[, "Charged Off"] > CTHRESH,
  "Charged Off",
  "Fully Paid"
)
table(predTstCT, true = lcdfTst$loan_status)
## true
## predTstCT Charged Off Fully Paid
## Charged Off 3790 17286
## Fully Paid 3907 29980
# ROC / AUC on the test data, scored by the 'Charged Off' probability
score <- predict(lcDT1b, lcdfTst, type = "prob")[, "Charged Off"]
# label.ordering gives the negative class first and the positive class second,
# so scores closer to one point towards 'Charged Off'
pred <- prediction(
  score,
  lcdfTst$loan_status,
  label.ordering = c("Fully Paid", "Charged Off")
)
# ROC curve, with the line y = x drawn for reference
aucPerf <- performance(pred, "tpr", "fpr")
plot(aucPerf)
abline(a = 0, b = 1)
# AUC value (aucPerf is reused for the AUC performance object)
aucPerf <- performance(pred, "auc")
aucPerf@y.values
## [[1]]
## [1] 0.5696635
# Lift curve
liftPerf <- performance(pred, "lift", "rpp")
plot(liftPerf)
# 5. Balanced and pruned tree lcDT1bp scored with the same threshold (CTHRESH)
# Training data: class probabilities -> thresholded labels -> confusion table
predProbTrn <- predict(lcDT1bp, lcdfTrn, type = "prob")
predTrnCT <- ifelse(
  predProbTrn[, "Charged Off"] > CTHRESH,
  "Charged Off",
  "Fully Paid"
)
table(predTrnCT, true = lcdfTrn$loan_status)
## true
## predTrnCT Charged Off Fully Paid
## Charged Off 7679 47234
## Fully Paid 0 50
# Test data: same steps
predProbTst <- predict(lcDT1bp, lcdfTst, type = "prob")
predTstCT <- ifelse(
  predProbTst[, "Charged Off"] > CTHRESH,
  "Charged Off",
  "Fully Paid"
)
table(predTstCT, true = lcdfTst$loan_status)
## true
## predTstCT Charged Off Fully Paid
## Charged Off 7692 47237
## Fully Paid 5 29
# ROC / AUC on the test data, scored by the 'Charged Off' probability
score <- predict(lcDT1bp, lcdfTst, type = "prob")[, "Charged Off"]
# label.ordering gives the negative class first and the positive class second,
# so scores closer to one point towards 'Charged Off'
pred <- prediction(
  score,
  lcdfTst$loan_status,
  label.ordering = c("Fully Paid", "Charged Off")
)
# ROC curve, with the line y = x drawn for reference
aucPerf <- performance(pred, "tpr", "fpr")
plot(aucPerf)
abline(a = 0, b = 1)
# AUC value (aucPerf is reused for the AUC performance object)
aucPerf <- performance(pred, "auc")
aucPerf@y.values
## [[1]]
## [1] 0.6442701
# Lift curve
liftPerf <- performance(pred, "lift", "rpp")
plot(liftPerf)
We have examined both the confusion matrix and the ROC/AUC results for the following models: the fully grown tree, the pruned trees, and the balanced tree. We also applied the lowered threshold value to each of them.
The C5.0 algorithm uses an information entropy computation to determine the best rule for splitting the data at each node into purer classes, by minimizing the computed entropy value. This means that as each node splits the data according to its rule, each resulting subset will contain less diversity of classes and will eventually contain only one class [complete purity]. This process is simple to compute, and therefore C5.0 runs quickly. C5.0 is robust: it can work with both numeric and categorical data [this example shows both types], and it can also tolerate missing data values. The output from the R implementation can be either a decision tree or a rule set. The output model can be used to assign [predict] a class to new, unclassified data items. Reference - http://mercury.webster.edu/aleshunas/R_learning_infrastructure/Classification%20of%20data%20using%20decision%20tree%20and%20regression%20tree%20methods.html
# Model 1: C5.0 tree on the training data without the varsOmit columns,
# requesting minCases = 30 through C5.0Control
c5_DT1 <- C5.0(
  loan_status ~ .,
  data = lcdfTrn %>% select(-all_of(varsOmit)),
  control = C5.0Control(minCases = 30)
)
summary(c5_DT1)
##
## Call:
## C5.0.formula(formula = loan_status ~ ., data = lcdfTrn
## %>% select(-all_of(varsOmit)), control = C5.0Control(minCases = 30))
##
##
## C5.0 [Release 2.07 GPL Edition] Sat Feb 19 18:41:31 2022
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 54963 cases (65 attributes) from undefined.data
##
## Decision tree:
## Fully Paid (54963/7679)
##
##
## Evaluation on training data (54963 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 1 7679(14.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 7679 (a): class Charged Off
## 47284 (b): class Fully Paid
##
##
## Time: 1.4 secs
# Model 2
# Model 1 ended up as a single root node, possibly because of class imbalance,
# so check the class counts in the training data first
lcdfTrn %>%
  group_by(loan_status) %>%
  tally()
## # A tibble: 2 × 2
## loan_status n
## <fct> <int>
## 1 Charged Off 7679
## 2 Fully Paid 47284
# To build the tree on more balanced data, C5.0 has a 'weights' parameter that
# takes a vector with one weight per example
# Here 'Charged Off' examples get weight 6 and 'Fully Paid' examples weight 1
caseWeights <- ifelse(lcdfTrn$loan_status == "Charged Off", 6, 1)
## Error
c5_DT2 <- C5.0(
  loan_status ~ .,
  data = lcdfTrn %>% select(-all_of(varsOmit)),
  weights = caseWeights,
  control = C5.0Control(minCases = 30)
)
summary(c5_DT2)
##
## Call:
## C5.0.formula(formula = loan_status ~ ., data = lcdfTrn
## %>% select(-all_of(varsOmit)), weights = caseWeights, control
## = C5.0Control(minCases = 30))
##
##
## C5.0 [Release 2.07 GPL Edition] Sat Feb 19 18:41:40 2022
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 54963 cases (66 attributes) from undefined.data
## Using relative case weighting
##
## Decision tree:
##
## int_rate > 11.22:
## :...sub_grade in {A1,A2,A3,A4,A5,B1,B2,G5}: Charged Off (0)
## : sub_grade in {C3,C4,C5,D1,D2,D3,D4,D5,E1,E2,E3,E4,E5,F1,F2,F3,F4,F5,G1,G2,
## : : G3}:
## : :...num_rev_accts > 45: Fully Paid (37.1/7.1)
## : : num_rev_accts <= 45:
## : : :...avg_cur_bal > 22133:
## : : :...mo_sin_old_il_acct > 231: Fully Paid (50/10.6)
## : : : mo_sin_old_il_acct <= 231:
## : : : :...verification_status = Not Verified:
## : : : :...sub_grade in {C3,C5,E2,E4,F2,F3,F4,F5,G1,G2,
## : : : : : G3}: Fully Paid (114.8/21.2)
## : : : : sub_grade in {C4,D1,D2,D3,D4,D5,E1,E3,E5,F1}:
## : : : : :...num_sats > 14: Charged Off (31.2/6.5)
## : : : : num_sats <= 14:
## : : : : :...num_rev_accts > 14: Fully Paid (42.4/10.6)
## : : : : num_rev_accts <= 14:
## : : : : :...annual_inc <= 44500: Fully Paid (32.4/7.1)
## : : : : annual_inc > 44500: [S1]
## : : : verification_status in {Source Verified,Verified}:
## : : : :...open_acc > 18: Fully Paid (55.3/17.7)
## : : : open_acc <= 18:
## : : : :...mths_since_recent_bc > 78:
## : : : :...int_rate <= 17.27: Fully Paid (73/14.1)
## : : : : int_rate > 17.27: Charged Off (47.7/19.4)
## : : : mths_since_recent_bc <= 78:
## : : : :...tax_liens > 1: Charged Off (40.6/8.8)
## : : : tax_liens <= 1:
## : : : :...num_il_tl > 25: Charged Off (47.1/8.2)
## : : : num_il_tl <= 25:
## : : : :...total_acc > 42: Fully Paid (55.3/14.1)
## : : : total_acc <= 42:
## : : : :...dti <= 5.13: [S2]
## : : : dti > 5.13:
## : : : :...mths_since_last_delinq > 48: [S3]
## : : : mths_since_last_delinq <= 48: [S4]
## : : avg_cur_bal <= 22133:
## : : :...installment <= 158.68:
## : : :...bc_open_to_buy > 17606: Fully Paid (39.4/7.1)
## : : : bc_open_to_buy <= 17606:
## : : : :...num_op_rev_tl > 14: Charged Off (129.5/30.6)
## : : : num_op_rev_tl <= 14:
## : : : :...num_bc_tl > 14: Fully Paid (40.6/7.1)
## : : : num_bc_tl <= 14:
## : : : :...mort_acc > 2:
## : : : :...openAccRatio <= 0.2833333: Fully Paid (30.6)
## : : : : openAccRatio > 0.2833333:
## : : : : :...revol_bal > 12760: Fully Paid (35.9/3.5)
## : : : : revol_bal <= 12760: [S5]
## : : : mort_acc <= 2:
## : : : :...mo_sin_old_il_acct > 237: Charged Off (74.8/14.7)
## : : : mo_sin_old_il_acct <= 237:
## : : : :...emp_length in {n/a,9 years}: [S6]
## : : : emp_length in {< 1 year,1 year,2 years,
## : : : : 3 years,4 years,5 years,
## : : : : 6 years,7 years,8 years,
## : : : : 10+ years}:
## : : : :...bc_util > 100.4: Fully Paid (40.6/7.1)
## : : : bc_util <= 100.4: [S7]
## : : installment > 158.68:
## : : :...grade in {A,B}: Charged Off (0)
## : : grade in {E,F,G}:
## : : :...num_tl_90g_dpd_24m > 2: Charged Off (38.3/2.9)
## : : : num_tl_90g_dpd_24m <= 2:
## : : : :...revol_bal > 28452:
## : : : :...home_ownership = MORTGAGE: Fully Paid (41.8/14.1)
## : : : : home_ownership in {ANY,NONE,OTHER,OWN,
## : : : : RENT}: Charged Off (87.7/24.1)
## : : : revol_bal <= 28452:
## : : : :...dti > 8.86: Charged Off (2144.8/516.3)
## : : : dti <= 8.86:
## : : : :...delinq_2yrs > 0: [S8]
## : : : delinq_2yrs <= 0: [S9]
## : : grade in {C,D}:
## : : :...mo_sin_rcnt_tl > 15:
## : : :...total_bal_ex_mort <= 5022: Fully Paid (38.3/7.1)
## : : : total_bal_ex_mort > 5022:
## : : : :...mths_since_recent_inq <= 1: Charged Off (176.6/45.9)
## : : : mths_since_recent_inq > 1: [S10]
## : : mo_sin_rcnt_tl <= 15:
## : : :...sub_grade in {E1,E2,E3,E4,E5,F1,F2,F3,F4,F5,G1,G2,
## : : : G3}: Charged Off (0)
## : : sub_grade in {D2,D3,D4,D5}:
## : : :...pub_rec_bankruptcies > 1: Charged Off (76.5/13)
## : : : pub_rec_bankruptcies <= 1:
## : : : :...mort_acc > 3: [S11]
## : : : mort_acc <= 3:
## : : : :...int_rate > 18.24: [S12]
## : : : int_rate <= 18.24: [S13]
## : : sub_grade in {C3,C4,C5,D1}:
## : : :...mths_since_recent_bc > 103: Fully Paid (78.3/31.8)
## : : mths_since_recent_bc <= 103:
## : : :...acc_open_past_24mths <= 3: [S14]
## : : acc_open_past_24mths > 3:
## : : :...inq_last_6mths > 4: Fully Paid (35.3/14.1)
## : : inq_last_6mths <= 4:
## : : :...annual_inc <= 21002: Charged Off (140.7/24.1)
## : : annual_inc > 21002:
## : : :...num_rev_tl_bal_gt_0 > 10: [S15]
## : : num_rev_tl_bal_gt_0 <= 10:
## : : :...total_acc > 27: [S16]
## : : total_acc <= 27: [S17]
## : sub_grade in {B3,B4,B5,C1,C2,G4}:
## : :...mort_acc > 7: Fully Paid (137.8/28.3)
## : mort_acc <= 7:
## : :...purpose = house: Fully Paid (31.8/3.5)
## : purpose in {car,credit_card,debt_consolidation,home_improvement,
## : : major_purchase,medical,moving,other,small_business,
## : : vacation}:
## : :...mo_sin_old_il_acct <= 4: Charged Off (58.3/12.4)
## : mo_sin_old_il_acct > 4:
## : :...acc_open_past_24mths > 6:
## : :...pct_tl_nvr_dlq <= 72.5: Fully Paid (53.6/14.1)
## : : pct_tl_nvr_dlq > 72.5:
## : : :...sub_grade = G4: Fully Paid (2.4)
## : : sub_grade in {B3,B4}:
## : : :...total_bal_ex_mort > 98375: Fully Paid (39.4)
## : : : total_bal_ex_mort <= 98375: [S18]
## : : sub_grade in {B5,C1,C2}:
## : : :...borrHistory <= 8.246407:
## : : :...sub_grade = C1: Fully Paid (54.2/7.1)
## : : : sub_grade in {B5,C2}:
## : : : :...num_tl_op_past_12m <= 2:
## : : : :...total_acc <= 15: Charged Off (32.4/4.1)
## : : : : total_acc > 15: Fully Paid (38.3/17.7)
## : : : num_tl_op_past_12m > 2:
## : : : :...avg_cur_bal <= 8169: Fully Paid (67.7/17.7)
## : : : avg_cur_bal > 8169: Charged Off (34.1/9.4)
## : : borrHistory > 8.246407:
## : : :...tot_cur_bal <= 135952:
## : : :...int_rate <= 12.88: [S19]
## : : : int_rate > 12.88:
## : : : :...num_il_tl > 21: Charged Off (79.5/15.9)
## : : : num_il_tl <= 21: [S20]
## : : tot_cur_bal > 135952:
## : : :...percent_bc_gt_75 <= 33.3: [S21]
## : : percent_bc_gt_75 > 33.3:
## : : :...num_rev_tl_bal_gt_0 <= 5: [S22]
## : : num_rev_tl_bal_gt_0 > 5: [S23]
## : acc_open_past_24mths <= 6:
## : :...mo_sin_rcnt_rev_tl_op > 57:
## : :...emp_length in {3 years,7 years,8 years,
## : : : 9 years}: Fully Paid (46.5)
## : : emp_length in {n/a,< 1 year,1 year,2 years,4 years,
## : : : 5 years,6 years,10+ years}:
## : : :...mort_acc > 1: Fully Paid (92.4/17.7)
## : : mort_acc <= 1: [S24]
## : mo_sin_rcnt_rev_tl_op <= 57:
## : :...tot_hi_cred_lim > 196702:
## : :...num_bc_tl <= 2: Fully Paid (50.6/7.1)
## : : num_bc_tl > 2:
## : : :...int_rate > 14.33: Fully Paid (49.5/7.1)
## : : int_rate <= 14.33:
## : : :...dti > 24.98:
## : : :...revol_util <= 37.3: Fully Paid (32.4/3.5)
## : : : revol_util > 37.3: [S25]
## : : dti <= 24.98: [S26]
## : tot_hi_cred_lim <= 196702:
## : :...loan_amnt > 34900: Charged Off (131.3/35.9)
## : loan_amnt <= 34900:
## : :...annual_inc > 83250: [S27]
## : annual_inc <= 83250:
## : :...num_rev_tl_bal_gt_0 > 7:
## : :...int_rate > 14.31: [S28]
## : : int_rate <= 14.31: [S29]
## : num_rev_tl_bal_gt_0 <= 7:
## : :...num_rev_accts > 22: [S30]
## : num_rev_accts <= 22:
## : :...total_acc > 33: [S31]
## : total_acc <= 33:
## : :...acc_open_past_24mths > 3: [S32]
## : acc_open_past_24mths <= 3: [S33]
## int_rate <= 11.22:
## :...sub_grade in {C1,C4,D1,D2,D3,D4,D5,E1,E2,E3,E4,E5,F1,F2,F3,F4,F5,G1,G2,G3,
## : G4,G5}: Fully Paid (0)
## sub_grade in {A1,A2,A3,C2,C3,C5}:
## :...num_rev_tl_bal_gt_0 > 17: Charged Off (30/12.4)
## : num_rev_tl_bal_gt_0 <= 17:
## : :...num_accts_ever_120_pd > 3: Fully Paid (40)
## : num_accts_ever_120_pd <= 3:
## : :...num_bc_sats > 14: Fully Paid (44.7)
## : num_bc_sats <= 14:
## : :...tax_liens > 0:
## : :...mo_sin_old_rev_tl_op <= 168: Charged Off (38.3/17.1)
## : : mo_sin_old_rev_tl_op > 168: Fully Paid (33.6/3.5)
## : tax_liens <= 0:
## : :...openAccRatio > 0.8510638:
## : :...tot_hi_cred_lim <= 69237: Charged Off (34.7/10)
## : : tot_hi_cred_lim > 69237: Fully Paid (30.6/3.5)
## : openAccRatio <= 0.8510638:
## : :...num_actv_rev_tl <= 1: Fully Paid (160.1/10.6)
## : num_actv_rev_tl > 1:
## : :...revol_bal > 68177: Fully Paid (112.4/3.5)
## : revol_bal <= 68177:
## : :...mo_sin_old_rev_tl_op > 442:
## : :...open_acc > 13: Fully Paid (35.3/3.5)
## : : open_acc <= 13: [S34]
## : mo_sin_old_rev_tl_op <= 442:
## : :...loan_amnt > 28225: Fully Paid (64.2)
## : loan_amnt <= 28225: [S35]
## sub_grade in {A4,A5,B1,B2,B3,B4,B5}:
## :...tax_liens > 2: Charged Off (83/33.6)
## tax_liens <= 2:
## :...grade in {C,D,E,F,G}: Fully Paid (0)
## grade = B:
## :...home_ownership in {ANY,NONE,OTHER}: Fully Paid (0)
## : home_ownership = RENT:
## : :...delinq_2yrs > 2:
## : : :...revol_util <= 67.4: Fully Paid (45.3/3.5)
## : : : revol_util > 67.4: Charged Off (30.6/13)
## : : delinq_2yrs <= 2:
## : : :...mo_sin_rcnt_tl > 13:
## : : :...total_acc > 34: Fully Paid (37.1/3.5)
## : : : total_acc <= 34:
## : : : :...pub_rec > 0: [S36]
## : : : pub_rec <= 0:
## : : : :...mo_sin_rcnt_tl <= 14: Fully Paid (43.6/3.5)
## : : : mo_sin_rcnt_tl > 14:
## : : : :...int_rate <= 9.76: Fully Paid (214.3/49.5)
## : : : int_rate > 9.76:
## : : : :...revol_util <= 52.7: [S37]
## : : : revol_util > 52.7: [S38]
## : : mo_sin_rcnt_tl <= 13:
## : : :...num_bc_sats > 12: Fully Paid (37.1/7.1)
## : : num_bc_sats <= 12:
## : : :...annual_inc <= 28528: [S39]
## : : annual_inc > 28528:
## : : :...tot_coll_amt > 5238: Charged Off (45.9/10.6)
## : : tot_coll_amt <= 5238:
## : : :...mths_since_recent_bc > 43:
## : : :...annual_inc <= 38313: Fully Paid (33)
## : : : annual_inc > 38313: [S40]
## : : mths_since_recent_bc <= 43:
## : : :...pub_rec > 1: [S41]
## : : pub_rec <= 1:
## : : :...num_accts_ever_120_pd > 0:
## : : :...acc_open_past_24mths > 7: [S42]
## : : : acc_open_past_24mths <= 7: [S43]
## : : num_accts_ever_120_pd <= 0: [S44]
## : home_ownership in {MORTGAGE,OWN}:
## : :...inq_last_6mths > 4: Charged Off (30.6/9.4)
## : inq_last_6mths <= 4:
## : :...chargeoff_within_12_mths > 0:
## : :...mo_sin_old_rev_tl_op <= 196: Charged Off (36.5/8.2)
## : : mo_sin_old_rev_tl_op > 196: Fully Paid (35.3/10.6)
## : chargeoff_within_12_mths <= 0:
## : :...mo_sin_old_rev_tl_op <= 56:
## : :...annual_inc <= 38500: Charged Off (48.9/10)
## : : annual_inc > 38500:
## : : :...home_ownership = OWN: Fully Paid (14.7)
## : : home_ownership = MORTGAGE:
## : : :...num_il_tl <= 5: Fully Paid (37.1/10.6)
## : : num_il_tl > 5: Charged Off (52.4/17.1)
## : mo_sin_old_rev_tl_op > 56:
## : :...openAccRatio > 0.8148148: Fully Paid (87.7/10.6)
## : openAccRatio <= 0.8148148:
## : :...num_op_rev_tl <= 2: Fully Paid (126.6/21.2)
## : num_op_rev_tl > 2:
## : :...collections_12_mths_ex_med > 0: [S45]
## : collections_12_mths_ex_med <= 0:
## : :...num_accts_ever_120_pd > 4: [S46]
## : num_accts_ever_120_pd <= 4:
## : :...bc_util <= 1.7: Fully Paid (46.5/3.5)
## : bc_util > 1.7: [S47]
## grade = A:
## :...revol_bal > 85339: Fully Paid (57.7)
## revol_bal <= 85339:
## :...purpose in {car,moving,vacation}: Fully Paid (53.6/3.5)
## purpose = small_business: Charged Off (30.6/9.4)
## purpose in {credit_card,debt_consolidation,
## : home_improvement,house,major_purchase,medical,
## : other}:
## :...mort_acc > 6: Fully Paid (138.4/21.2)
## mort_acc <= 6:
## :...dti > 23.21:
## :...dti <= 23.42: Charged Off (58.3/15.9)
## : dti > 23.42:
## : :...installment <= 158.13: Charged Off (123.6/45.9)
## : installment > 158.13:
## : :...tot_hi_cred_lim <= 46486: Fully Paid (35.9)
## : tot_hi_cred_lim > 46486: [S48]
## dti <= 23.21:
## :...dti > 22.9: Fully Paid (38.3)
## dti <= 22.9:
## :...delinq_2yrs > 2: [S49]
## delinq_2yrs <= 2:
## :...mo_sin_rcnt_tl > 36: Fully Paid (83.6/7.1)
## mo_sin_rcnt_tl <= 36:
## :...mo_sin_old_rev_tl_op <= 80: [S50]
## mo_sin_old_rev_tl_op > 80: [S51]
##
## SubTree [S1]
##
## mo_sin_old_rev_tl_op > 187: Fully Paid (31.8/7.1)
## mo_sin_old_rev_tl_op <= 187:
## :...emp_length in {n/a,1 year,2 years,3 years,6 years}: Fully Paid (10.6)
## emp_length in {< 1 year,4 years,5 years,7 years,8 years,9 years,
## 10+ years}: Charged Off (117.2/28.8)
##
## SubTree [S2]
##
## mths_since_last_delinq <= 17: Charged Off (30/8.8)
## mths_since_last_delinq > 17: Fully Paid (47.1/3.5)
##
## SubTree [S3]
##
## mths_since_recent_inq > 11: Charged Off (42.4/3.5)
## mths_since_recent_inq <= 11:
## :...total_bc_limit <= 3400: Charged Off (52.4/6.5)
## total_bc_limit > 3400:
## :...acc_open_past_24mths <= 6: Fully Paid (54.2/21.2)
## acc_open_past_24mths > 6: Charged Off (40.6/8.8)
##
## SubTree [S4]
##
## num_op_rev_tl <= 2: Fully Paid (52.4/17.7)
## num_op_rev_tl > 2:
## :...tot_cur_bal <= 194286:
## :...int_rate <= 17.77: Fully Paid (55.3/10.6)
## : int_rate > 17.77: Charged Off (30/12.4)
## tot_cur_bal > 194286:
## :...home_ownership in {ANY,NONE,OTHER}: Charged Off (0)
## home_ownership = OWN: Fully Paid (46.5/17.7)
## home_ownership in {MORTGAGE,RENT}:
## :...total_rev_hi_lim <= 6150: Charged Off (66.5/10)
## total_rev_hi_lim > 6150:
## :...annual_inc <= 76000:
## :...initial_list_status = w: Charged Off (185.5/40.6)
## : initial_list_status = f:
## : :...propSatisBankcardAccts > 0.7619048: Fully Paid (33.6/7.1)
## : propSatisBankcardAccts <= 0.7619048:
## : :...int_rate <= 15.61: Fully Paid (45.3/21.2)
## : int_rate > 15.61: Charged Off (128.9/30)
## annual_inc > 76000:
## :...tot_coll_amt > 20: Fully Paid (44.7/10.6)
## tot_coll_amt <= 20:
## :...num_sats <= 10:
## :...grade in {A,B,D,E,G}: Fully Paid (63.6/17.7)
## : grade = F: Charged Off (3.5)
## : grade = C:
## : :...num_rev_tl_bal_gt_0 <= 3: Charged Off (45.9/14.1)
## : num_rev_tl_bal_gt_0 > 3: Fully Paid (52.4/17.7)
## num_sats > 10:
## :...num_bc_sats <= 3: Charged Off (69.5/13)
## num_bc_sats > 3: [S52]
##
## SubTree [S5]
##
## pct_tl_nvr_dlq <= 93.8: Fully Paid (49.5/14.1)
## pct_tl_nvr_dlq > 93.8: Charged Off (60.1/14.1)
##
## SubTree [S6]
##
## mths_since_last_delinq <= 39: Charged Off (335.6/88.3)
## mths_since_last_delinq > 39: Fully Paid (36.5/14.1)
##
## SubTree [S7]
##
## collections_12_mths_ex_med > 0: Charged Off (48.9/13.5)
## collections_12_mths_ex_med <= 0:
## :...acc_open_past_24mths <= 3:
## :...total_rev_hi_lim <= 2150: Charged Off (70.1/17.1)
## : total_rev_hi_lim > 2150:
## : :...annual_inc > 66500: Fully Paid (33)
## : annual_inc <= 66500:
## : :...emp_length in {< 1 year,1 year,3 years,4 years,5 years,6 years,
## : : 8 years}:
## : :...purpose in {car,home_improvement,house,
## : : : other}: Fully Paid (67.1/28.3)
## : : purpose in {credit_card,major_purchase,medical,moving,
## : : : small_business,
## : : : vacation}: Charged Off (116.6/38.9)
## : : purpose = debt_consolidation:
## : : :...annual_inc <= 29500: Charged Off (57.1/21.8)
## : : annual_inc > 29500: Fully Paid (39.4/7.1)
## : emp_length in {2 years,7 years,10+ years}:
## : :...home_ownership in {ANY,MORTGAGE,NONE,
## : : OTHER}: Fully Paid (28.3)
## : home_ownership in {OWN,RENT}:
## : :...bc_open_to_buy <= 1703: Fully Paid (65.9/7.1)
## : bc_open_to_buy > 1703: Charged Off (51.2/23)
## acc_open_past_24mths > 3:
## :...total_bal_ex_mort > 96013: Fully Paid (33/7.1)
## total_bal_ex_mort <= 96013:
## :...pub_rec_bankruptcies > 0:
## :...loan_amnt <= 3325: Fully Paid (44.7/7.1)
## : loan_amnt > 3325:
## : :...num_sats <= 8: Charged Off (44.7/9.4)
## : num_sats > 8: Fully Paid (37.7/17.7)
## pub_rec_bankruptcies <= 0:
## :...num_actv_rev_tl > 3:
## :...purpose in {car,major_purchase,medical,moving,
## : : other}: Charged Off (222.5/67.1)
## : purpose in {home_improvement,house,small_business,
## : : vacation}: Fully Paid (49.5/21.2)
## : purpose = credit_card:
## : :...home_ownership in {ANY,MORTGAGE,NONE,OTHER,
## : : : RENT}: Charged Off (60.6/21.8)
## : : home_ownership = OWN: Fully Paid (4.1)
## : purpose = debt_consolidation:
## : :...propSatisBankcardAccts <= 0.4347826: Fully Paid (32.4/10.6)
## : propSatisBankcardAccts > 0.4347826: Charged Off (250.8/81.2)
## num_actv_rev_tl <= 3:
## :...num_bc_sats > 3: Charged Off (33.6/8.8)
## num_bc_sats <= 3:
## :...total_bc_limit > 4350: Fully Paid (41.8/3.5)
## total_bc_limit <= 4350:
## :...total_rev_hi_lim <= 6252: Fully Paid (60.1/21.2)
## total_rev_hi_lim > 6252: Charged Off (55.9/13.5)
##
## SubTree [S8]
##
## verification_status in {Not Verified,Source Verified}: Charged Off (45.3/6.5)
## verification_status = Verified: Fully Paid (17.1/7.1)
##
## SubTree [S9]
##
## verification_status = Not Verified: Fully Paid (23/7.1)
## verification_status = Verified: Charged Off (90.7/30.6)
## verification_status = Source Verified:
## :...sub_grade in {C3,C4,C5,D1,D2,D3,D4,D5,E1,E4,F1,F2,G1,
## : G2}: Charged Off (46.5/14.7)
## sub_grade in {E2,E3,E5,F3,F4,F5,G3}: Fully Paid (15.9)
##
## SubTree [S10]
##
## purpose in {car,home_improvement,house,major_purchase,medical,small_business,
## : vacation}: Fully Paid (41.8/10.6)
## purpose in {moving,other}: Charged Off (53.6/21.8)
## purpose = credit_card:
## :...num_op_rev_tl <= 7: Fully Paid (44.2/10.6)
## : num_op_rev_tl > 7: Charged Off (56.5/24.7)
## purpose = debt_consolidation:
## :...annual_inc > 77550: Fully Paid (47.1/10.6)
## annual_inc <= 77550:
## :...verification_status in {Not Verified,Source Verified}:
## :...emp_length in {n/a,2 years,8 years,9 years,
## : : 10+ years}: Fully Paid (84.2/31.8)
## : emp_length in {< 1 year,1 year,3 years,4 years,5 years,6 years,
## : 7 years}: Charged Off (103.6/40)
## verification_status = Verified:
## :...num_rev_accts <= 7: Fully Paid (31.8/14.1)
## num_rev_accts > 7: Charged Off (128.9/26.5)
##
## SubTree [S11]
##
## purpose in {car,credit_card,moving,small_business}: Charged Off (70.1/24.1)
## purpose in {home_improvement,house,major_purchase,medical,other,
## : vacation}: Fully Paid (48.9/17.7)
## purpose = debt_consolidation:
## :...home_ownership = OWN: Fully Paid (6.5)
## home_ownership in {ANY,NONE,OTHER,RENT}: Charged Off (77.1/24.1)
## home_ownership = MORTGAGE:
## :...emp_length in {n/a,4 years,5 years,9 years}: Charged Off (58.9/13)
## emp_length in {< 1 year,1 year,2 years,3 years,6 years,7 years,
## : 8 years}: Fully Paid (24.1/7.1)
## emp_length = 10+ years:
## :...avg_cur_bal <= 14602: Fully Paid (31.8/10.6)
## avg_cur_bal > 14602: Charged Off (53/10.6)
##
## SubTree [S12]
##
## num_accts_ever_120_pd > 1: Fully Paid (74.8/35.3)
## num_accts_ever_120_pd <= 1:
## :...delinq_2yrs > 1: Charged Off (58.3/8.8)
## delinq_2yrs <= 1:
## :...purpose in {car,home_improvement,house,major_purchase,medical,other,
## : small_business,vacation}: Charged Off (280.2/82.4)
## purpose = moving: Fully Paid (5.3)
## purpose = credit_card:
## :...initial_list_status = w: Charged Off (41.8/10)
## : initial_list_status = f:
## : :...loan_amnt > 19750: Charged Off (39.4/7.7)
## : loan_amnt <= 19750: [S53]
## purpose = debt_consolidation:
## :...annual_inc <= 55000: Charged Off (426.2/129.5)
## annual_inc > 55000:
## :...sub_grade = D5: Fully Paid (31.8/7.1)
## sub_grade in {D2,D3,D4}:
## :...emp_length in {n/a,2 years,3 years,5 years,6 years,8 years,
## : 9 years}: Charged Off (102.4/35.3)
## emp_length in {< 1 year,1 year,4 years,
## : 7 years}: Fully Paid (31.2/10.6)
## emp_length = 10+ years:
## :...total_bal_ex_mort <= 30732: Charged Off (44.2/12.4)
## total_bal_ex_mort > 30732: Fully Paid (31.8/7.1)
##
## SubTree [S13]
##
## num_actv_rev_tl > 14: Charged Off (66.5/31.2)
## num_actv_rev_tl <= 14:
## :...num_tl_90g_dpd_24m > 0:
## :...num_accts_ever_120_pd <= 1: Charged Off (179.6/27.7)
## : num_accts_ever_120_pd > 1:
## : :...borrHistory <= 11.74812: Charged Off (36.5/4.7)
## : borrHistory > 11.74812: Fully Paid (33.6/14.1)
## num_tl_90g_dpd_24m <= 0:
## :...mths_since_recent_inq <= 1: Charged Off (783/182.5)
## mths_since_recent_inq > 1:
## :...purpose = car: Fully Paid (10/3.5)
## purpose in {credit_card,home_improvement,house,major_purchase,
## : medical,moving,small_business,
## : vacation}: Charged Off (463.9/124.8)
## purpose = other:
## :...borrHistory <= 12.33402: Charged Off (71.8/18.8)
## : borrHistory > 12.33402: Fully Paid (31.8/7.1)
## purpose = debt_consolidation:
## :...mort_acc <= 0:
## :...mths_since_last_delinq <= 13: Fully Paid (34.7/14.1)
## : mths_since_last_delinq > 13: Charged Off (816/219)
## mort_acc > 0:
## :...num_sats > 11: Charged Off (254.3/67.1)
## num_sats <= 11:
## :...home_ownership in {ANY,NONE,
## : OTHER}: Charged Off (0)
## home_ownership = RENT: Fully Paid (43.6/10.6)
## home_ownership in {MORTGAGE,OWN}:
## :...tot_cur_bal <= 83082: Charged Off (83.6/20)
## tot_cur_bal > 83082:
## :...dti <= 22.43: Fully Paid (32.4/7.1)
## dti > 22.43: Charged Off (30/12.4)
##
## SubTree [S14]
##
## purpose in {car,home_improvement,major_purchase,medical,small_business,
## : vacation}: Charged Off (227.8/83)
## purpose in {house,moving}: Fully Paid (30/14.1)
## purpose = other:
## :...mo_sin_old_il_acct <= 117: Fully Paid (38.9/7.1)
## : mo_sin_old_il_acct > 117:
## : :...int_rate <= 14.65: Charged Off (58.9/13)
## : int_rate > 14.65: Fully Paid (43/17.7)
## purpose = credit_card:
## :...mo_sin_rcnt_tl <= 3: Charged Off (156.6/43.6)
## : mo_sin_rcnt_tl > 3:
## : :...num_actv_bc_tl > 5: Fully Paid (34.1/7.1)
## : num_actv_bc_tl <= 5:
## : :...total_il_high_credit_limit <= 12190: Fully Paid (54.8/14.1)
## : total_il_high_credit_limit > 12190:
## : :...sub_grade in {C3,D1}: Charged Off (91.8/31.8)
## : sub_grade = C5: Fully Paid (17.1/3.5)
## : sub_grade = C4:
## : :...home_ownership in {ANY,MORTGAGE,NONE,OTHER,
## : : RENT}: Charged Off (65.3/12.4)
## : home_ownership = OWN: Fully Paid (2.9)
## purpose = debt_consolidation:
## :...num_accts_ever_120_pd > 0:
## :...num_rev_accts <= 4: Charged Off (31.2/2.9)
## : num_rev_accts > 4:
## : :...mo_sin_old_rev_tl_op > 137: Charged Off (241.4/71.8)
## : mo_sin_old_rev_tl_op <= 137:
## : :...mo_sin_old_rev_tl_op <= 109: Charged Off (47.7/19.4)
## : mo_sin_old_rev_tl_op > 109: Fully Paid (30.6/3.5)
## num_accts_ever_120_pd <= 0:
## :...total_acc > 28:
## :...num_op_rev_tl <= 8: Fully Paid (38.9/3.5)
## : num_op_rev_tl > 8:
## : :...sub_grade in {C3,C4,D1}: Charged Off (82.4/33)
## : sub_grade = C5: Fully Paid (11.2)
## total_acc <= 28:
## :...home_ownership in {ANY,NONE,OTHER}: Charged Off (0)
## home_ownership = OWN:
## :...total_bal_ex_mort <= 15265: Charged Off (42.4/14.1)
## : total_bal_ex_mort > 15265: Fully Paid (38.9/7.1)
## home_ownership in {MORTGAGE,RENT}:
## :...initial_list_status = w: Charged Off (384.4/126.6)
## initial_list_status = f:
## :...num_rev_accts <= 5: Fully Paid (54.2/17.7)
## num_rev_accts > 5:
## :...mths_since_recent_bc > 36:
## :...num_actv_rev_tl <= 5: Fully Paid (31.8/3.5)
## : num_actv_rev_tl > 5: Charged Off (35.3/14.1)
## mths_since_recent_bc <= 36:
## :...mo_sin_rcnt_rev_tl_op > 26: Charged Off (30.6/2.4)
## mo_sin_rcnt_rev_tl_op <= 26:
## :...open_acc <= 6: Charged Off (98.9/28.3)
## open_acc > 6:
## :...num_actv_bc_tl <= 3:
## :...revol_util <= 74.4: Fully Paid (31.2/3.5)
## : revol_util > 74.4: Charged Off (30/12.4)
## num_actv_bc_tl > 3:
## :...num_il_tl <= 3: Fully Paid (47.7/17.7)
## num_il_tl > 3: Charged Off (77.1/20.6)
##
## SubTree [S15]
##
## num_rev_accts > 34: Fully Paid (45.3/17.7)
## num_rev_accts <= 34:
## :...open_acc > 26: Charged Off (59.5/6.5)
## open_acc <= 26:
## :...tot_hi_cred_lim > 337079: Fully Paid (30.6/10.6)
## tot_hi_cred_lim <= 337079:
## :...delinq_2yrs <= 0: Charged Off (551.1/137.8)
## delinq_2yrs > 0:
## :...initial_list_status = f: Fully Paid (38.3/14.1)
## initial_list_status = w: Charged Off (44.2/8.8)
##
## SubTree [S16]
##
## total_il_high_credit_limit <= 4544: Fully Paid (75.9/24.7)
## total_il_high_credit_limit > 4544:
## :...purpose in {car,house,major_purchase,moving,small_business,
## : vacation}: Charged Off (81.8/25.3)
## purpose = medical: Fully Paid (7.1)
## purpose = home_improvement:
## :...num_bc_sats <= 3: Charged Off (50/7.7)
## : num_bc_sats > 3: Fully Paid (39.4/14.1)
## purpose = other:
## :...borrHistory <= 16.0794: Charged Off (59.5/10)
## : borrHistory > 16.0794: Fully Paid (30.6/14.1)
## purpose = credit_card:
## :...mths_since_last_delinq > 49: Charged Off (64.8/11.8)
## : mths_since_last_delinq <= 49:
## : :...num_rev_accts > 20: Charged Off (91.3/27.7)
## : num_rev_accts <= 20:
## : :...grade = D: Fully Paid (9.4)
## : grade = C:
## : :...percent_bc_gt_75 <= 93.7: Fully Paid (48.9/14.1)
## : percent_bc_gt_75 > 93.7: Charged Off (34.7/10)
## purpose = debt_consolidation:
## :...emp_length in {n/a,7 years,9 years}: Charged Off (153.1/57.7)
## emp_length in {5 years,6 years}: Fully Paid (76.5/28.3)
## emp_length = < 1 year:
## :...propSatisBankcardAccts <= 0.4571429: Charged Off (37.1/8.8)
## : propSatisBankcardAccts > 0.4571429: Fully Paid (34.1/14.1)
## emp_length = 1 year:
## :...borrHistory <= 15.17043: Fully Paid (31.2/14.1)
## : borrHistory > 15.17043: Charged Off (38.3/6.5)
## emp_length = 2 years:
## :...bc_util <= 75.1: Fully Paid (34.1/10.6)
## : bc_util > 75.1: Charged Off (61.8/12.4)
## emp_length = 3 years:
## :...mo_sin_old_il_acct <= 123: Charged Off (33/8.2)
## : mo_sin_old_il_acct > 123: Fully Paid (43/17.7)
## emp_length = 4 years:
## :...annual_inc <= 77500: Fully Paid (40/17.7)
## : annual_inc > 77500: Charged Off (37.7/5.9)
## emp_length = 8 years:
## :...verification_status in {Not Verified,
## : : Source Verified}: Fully Paid (25.3/10.6)
## : verification_status = Verified: Charged Off (35.9/7.7)
## emp_length = 10+ years:
## :...num_sats > 22: Charged Off (31.2/2.9)
## num_sats <= 22:
## :...tot_coll_amt > 0: Fully Paid (35.3/10.6)
## tot_coll_amt <= 0:
## :...propSatisBankcardAccts > 0.72: Fully Paid (40/14.1)
## propSatisBankcardAccts <= 0.72:
## :...inq_last_6mths > 2: Charged Off (56.5/10.6)
## inq_last_6mths <= 2:
## :...home_ownership in {ANY,NONE,OTHER,OWN,
## : RENT}: Charged Off (127.2/35.3)
## home_ownership = MORTGAGE: [S54]
##
## SubTree [S17]
##
## num_tl_90g_dpd_24m > 0: Charged Off (167.2/40)
## num_tl_90g_dpd_24m <= 0:
## :...mo_sin_old_rev_tl_op > 460: Charged Off (38.3/2.9)
## mo_sin_old_rev_tl_op <= 460:
## :...num_rev_accts <= 6:
## :...inq_last_6mths > 1: Charged Off (85.4/21.8)
## : inq_last_6mths <= 1:
## : :...emp_length in {2 years,8 years}: Fully Paid (26.5)
## : emp_length in {n/a,< 1 year,1 year,3 years,5 years,9 years}:
## : :...num_actv_rev_tl <= 3: Charged Off (95.4/24.7)
## : : num_actv_rev_tl > 3: Fully Paid (55.3/24.7)
## : emp_length in {4 years,6 years,7 years,10+ years}:
## : :...tot_cur_bal <= 34813: Charged Off (53.6/21.8)
## : tot_cur_bal > 34813: Fully Paid (37.7/3.5)
## num_rev_accts > 6:
## :...mo_sin_old_il_acct <= 75:
## :...num_il_tl > 4: Charged Off (293.8/57.1)
## : num_il_tl <= 4:
## : :...num_il_tl <= 2: Charged Off (402.7/109.5)
## : num_il_tl > 2:
## : :...home_ownership in {ANY,MORTGAGE,NONE,
## : : OTHER}: Charged Off (55.3/16.5)
## : home_ownership = OWN: Fully Paid (9.4)
## : home_ownership = RENT:
## : :...acc_open_past_24mths <= 6: Fully Paid (46.5/17.7)
## : acc_open_past_24mths > 6: Charged Off (72.4/15.9)
## mo_sin_old_il_acct > 75:
## :...num_bc_tl > 11: Charged Off (177.2/39.4)
## num_bc_tl <= 11:
## :...annual_inc <= 57388: Charged Off (1013.2/338.5)
## annual_inc > 57388:
## :...acc_open_past_24mths <= 4:
## :...total_rev_hi_lim > 20400: Fully Paid (50.6/3.5)
## : total_rev_hi_lim <= 20400:
## : :...purpose in {car,major_purchase,medical,
## : : vacation}: Charged Off (0)
## : purpose in {credit_card,moving,other,
## : : small_business}: Fully Paid (9.4)
## : purpose in {debt_consolidation,
## : : home_improvement,house}:
## : :...grade = C: Charged Off (58.3/15.9)
## : grade = D: Fully Paid (12.4/3.5)
## acc_open_past_24mths > 4:
## :...open_acc <= 7: Fully Paid (33/10.6)
## open_acc > 7:
## :...mths_since_last_delinq > 53: Fully Paid (33/10.6)
## mths_since_last_delinq <= 53:
## :...num_bc_sats > 6: Fully Paid (48.9/21.2)
## num_bc_sats <= 6: [S55]
##
## SubTree [S18]
##
## home_ownership in {ANY,NONE,OTHER}: Charged Off (0)
## home_ownership = RENT:
## :...mths_since_recent_bc <= 4: Charged Off (93/18.8)
## : mths_since_recent_bc > 4:
## : :...pct_tl_nvr_dlq <= 96.5: Fully Paid (35.9/7.1)
## : pct_tl_nvr_dlq > 96.5:
## : :...acc_open_past_24mths > 8: Charged Off (52.4/10)
## : acc_open_past_24mths <= 8:
## : :...num_bc_tl <= 10: Fully Paid (31.8/7.1)
## : num_bc_tl > 10: Charged Off (34.1/9.4)
## home_ownership in {MORTGAGE,OWN}:
## :...num_tl_op_past_12m <= 2: Fully Paid (64.2/10.6)
## num_tl_op_past_12m > 2:
## :...borrHistory > 20.16975: Fully Paid (31.2/3.5)
## borrHistory <= 20.16975:
## :...total_bc_limit > 28300: Fully Paid (31.2/7.1)
## total_bc_limit <= 28300:
## :...revol_util <= 42.6: Charged Off (100.7/26.5)
## revol_util > 42.6:
## :...acc_open_past_24mths <= 8: Fully Paid (31.2/3.5)
## acc_open_past_24mths > 8: Charged Off (32.4/11.2)
##
## SubTree [S19]
##
## purpose in {car,home_improvement,major_purchase,other,small_business,
## : vacation}: Charged Off (150.7/37.7)
## purpose in {medical,moving}: Fully Paid (3.5)
## purpose = credit_card:
## :...mort_acc <= 0: Charged Off (191.3/35.9)
## : mort_acc > 0:
## : :...verification_status in {Not Verified,Verified}: Fully Paid (22.4/7.1)
## : verification_status = Source Verified: Charged Off (44.7/9.4)
## purpose = debt_consolidation:
## :...mths_since_recent_bc > 6: Charged Off (431.5/106.6)
## mths_since_recent_bc <= 6:
## :...total_rev_hi_lim <= 16025: Fully Paid (34.7/7.1)
## total_rev_hi_lim > 16025:
## :...verification_status = Not Verified: Charged Off (79.5/15.9)
## verification_status in {Source Verified,Verified}:
## :...pct_tl_nvr_dlq <= 97: Charged Off (93/25.9)
## pct_tl_nvr_dlq > 97: Fully Paid (50/17.7)
##
## SubTree [S20]
##
## int_rate > 14.49: Charged Off (58.9/13)
## int_rate <= 14.49:
## :...purpose in {car,home_improvement,major_purchase,medical,
## : other}: Fully Paid (25.3/7.1)
## purpose in {credit_card,moving,small_business,
## : vacation}: Charged Off (106.6/39.4)
## purpose = debt_consolidation:
## :...emp_length in {n/a,< 1 year,1 year,6 years,7 years,
## : 8 years}: Charged Off (134.2/45.9)
## emp_length in {2 years,3 years,4 years,5 years,
## : 9 years}: Fully Paid (58.9/21.2)
## emp_length = 10+ years:
## :...num_sats <= 13: Charged Off (36.5/11.8)
## num_sats > 13: Fully Paid (33.6/10.6)
##
## SubTree [S21]
##
## home_ownership in {ANY,NONE,OTHER,RENT}: Fully Paid (9.4)
## home_ownership in {MORTGAGE,OWN}:
## :...total_bal_ex_mort > 155000: Charged Off (30/5.3)
## total_bal_ex_mort <= 155000:
## :...initial_list_status = f: Fully Paid (81.8/14.1)
## initial_list_status = w:
## :...purpose in {car,credit_card,home_improvement,moving,other,
## : vacation}: Fully Paid (35.3/10.6)
## purpose in {debt_consolidation,major_purchase,medical,
## small_business}: Charged Off (62.4/20)
##
## SubTree [S22]
##
## propSatisBankcardAccts <= 0.4210526: Fully Paid (45.9/10.6)
## propSatisBankcardAccts > 0.4210526:
## :...open_acc <= 10: Charged Off (36.5/8.2)
## open_acc > 10: Fully Paid (32.4/14.1)
##
## SubTree [S23]
##
## mths_since_recent_bc > 19: Fully Paid (37.1/14.1)
## mths_since_recent_bc <= 19:
## :...delinq_2yrs > 0:
## :...emp_length in {< 1 year,3 years,5 years,7 years,
## : : 9 years}: Fully Paid (5.9)
## : emp_length in {n/a,1 year,2 years,4 years,6 years,8 years,
## : 10+ years}: Charged Off (81.8/11.2)
## delinq_2yrs <= 0:
## :...purpose in {car,credit_card,major_purchase,moving,other,
## : small_business}: Charged Off (93/22.4)
## purpose in {home_improvement,medical,vacation}: Fully Paid (13.5/3.5)
## purpose = debt_consolidation:
## :...mths_since_last_delinq > 58: Charged Off (37.7/5.9)
## mths_since_last_delinq <= 58:
## :...total_bc_limit <= 24100: Charged Off (83/30)
## total_bc_limit > 24100: Fully Paid (30/7.1)
##
## SubTree [S24]
##
## acc_open_past_24mths > 1: Charged Off (46.5/14.7)
## acc_open_past_24mths <= 1:
## :...num_actv_rev_tl <= 3: Fully Paid (40/7.1)
## num_actv_rev_tl > 3:
## :...bc_open_to_buy <= 1107: Charged Off (34.7/10)
## bc_open_to_buy > 1107: Fully Paid (31.2/10.6)
##
## SubTree [S25]
##
## home_ownership in {ANY,NONE,OTHER}: Charged Off (0)
## home_ownership in {OWN,RENT}: Fully Paid (40/14.1)
## home_ownership = MORTGAGE:
## :...avg_cur_bal > 41251: Charged Off (41.2/5.9)
## avg_cur_bal <= 41251:
## :...tot_cur_bal > 368070: Fully Paid (34.1/7.1)
## tot_cur_bal <= 368070:
## :...open_acc > 11: Charged Off (282/98.3)
## open_acc <= 11:
## :...bc_open_to_buy <= 4246: Fully Paid (36.5/3.5)
## bc_open_to_buy > 4246: Charged Off (45.3/17.1)
##
## SubTree [S26]
##
## purpose in {car,medical,moving,small_business}: Fully Paid (31.8)
## purpose in {credit_card,debt_consolidation,home_improvement,major_purchase,
## : other,vacation}:
## :...dti > 23.75: Fully Paid (63.6/7.1)
## dti <= 23.75:
## :...num_accts_ever_120_pd > 4: Fully Paid (40.6/7.1)
## num_accts_ever_120_pd <= 4:
## :...total_bal_ex_mort <= 10004: Charged Off (84.8/28.3)
## total_bal_ex_mort > 10004:
## :...mo_sin_rcnt_tl <= 1:
## :...emp_length in {n/a,1 year,5 years,
## : : 8 years}: Fully Paid (13)
## : emp_length in {< 1 year,2 years,3 years,4 years,6 years,
## : : 7 years,9 years,10+ years}:
## : :...total_acc > 31: Fully Paid (34.1/10.6)
## : total_acc <= 31:
## : :...mo_sin_old_il_acct <= 130: Fully Paid (31.2/14.1)
## : mo_sin_old_il_acct > 130: Charged Off (77.1/13.5)
## mo_sin_rcnt_tl > 1:
## :...mo_sin_old_rev_tl_op > 360: Fully Paid (37.7)
## mo_sin_old_rev_tl_op <= 360:
## :...num_bc_sats > 10: Charged Off (39.4/11.2)
## num_bc_sats <= 10:
## :...num_bc_tl > 15: Fully Paid (55.3/7.1)
## num_bc_tl <= 15:
## :...mo_sin_old_rev_tl_op <= 94: Fully Paid (74.8/10.6)
## mo_sin_old_rev_tl_op > 94: [S56]
##
## SubTree [S27]
##
## purpose in {car,major_purchase,medical,moving,other}: Fully Paid (103.6/35.3)
## purpose in {small_business,vacation}: Charged Off (36.5/11.8)
## purpose = home_improvement:
## :...bc_util <= 71.5: Fully Paid (30.6/3.5)
## : bc_util > 71.5: Charged Off (35.3/10.6)
## purpose = credit_card:
## :...tot_cur_bal <= 27040: Fully Paid (35.3)
## : tot_cur_bal > 27040:
## : :...emp_length in {1 year,5 years,7 years,9 years}: Fully Paid (26.5)
## : emp_length in {n/a,< 1 year,2 years,3 years,4 years,6 years,8 years,
## : : 10+ years}:
## : :...open_acc > 13: Fully Paid (33/7.1)
## : open_acc <= 13:
## : :...num_actv_rev_tl <= 4: Fully Paid (30/7.1)
## : num_actv_rev_tl > 4: Charged Off (87.1/34.1)
## purpose = debt_consolidation:
## :...tax_liens > 0: Charged Off (49.5/17.7)
## tax_liens <= 0:
## :...mo_sin_old_rev_tl_op > 305: Fully Paid (33.6/3.5)
## mo_sin_old_rev_tl_op <= 305:
## :...emp_length = 1 year: Charged Off (63.6/17.7)
## emp_length in {n/a,< 1 year,2 years,3 years,4 years,5 years,
## : 6 years,7 years,8 years,9 years,10+ years}:
## :...int_rate > 12.99:
## :...grade in {A,B,D,E,F,G}: Fully Paid (34.1)
## : grade = C:
## : :...emp_length in {n/a,2 years,4 years,6 years,
## : : 7 years}: Fully Paid (21.2)
## : emp_length in {< 1 year,3 years,5 years,8 years,
## : : 9 years,10+ years}:
## : :...percent_bc_gt_75 <= 62.5: Fully Paid (32.4/7.1)
## : percent_bc_gt_75 > 62.5: Charged Off (57.7/22.4)
## int_rate <= 12.99:
## :...pub_rec_bankruptcies > 0: Fully Paid (34.1/7.1)
## pub_rec_bankruptcies <= 0:
## :...num_tl_op_past_12m <= 0: Fully Paid (49.5/14.1)
## num_tl_op_past_12m > 0:
## :...mths_since_recent_inq <= 1: Charged Off (69.5/20)
## mths_since_recent_inq > 1:
## :...emp_length in {n/a,< 1 year,2 years,3 years,
## : 5 years,
## : 8 years}: Fully Paid (81.2/28.3)
## emp_length in {4 years,6 years,7 years,
## : 9 years}: Charged Off (67.1/24.7)
## emp_length = 10+ years:
## :...openAccRatio <= 0.5769231: Fully Paid (35.9/3.5)
## openAccRatio > 0.5769231: Charged Off (30/5.3)
##
## SubTree [S28]
##
## bc_open_to_buy <= 5516: Fully Paid (65.9/10.6)
## bc_open_to_buy > 5516: Charged Off (48.9/10)
##
## SubTree [S29]
##
## purpose in {car,home_improvement,medical,
## : small_business}: Charged Off (70.1/20.6)
## purpose in {major_purchase,moving,other,vacation}: Fully Paid (49.5/21.2)
## purpose = credit_card:
## :...dti > 19.09: Charged Off (260.2/69.5)
## : dti <= 19.09:
## : :...home_ownership in {ANY,MORTGAGE,NONE,OTHER}: Fully Paid (28.3/3.5)
## : home_ownership in {OWN,RENT}:
## : :...num_bc_tl <= 7: Charged Off (35.9/7.7)
## : num_bc_tl > 7:
## : :...loan_amnt <= 11875: Fully Paid (30/7.1)
## : loan_amnt > 11875: Charged Off (30/12.4)
## purpose = debt_consolidation:
## :...propSatisBankcardAccts > 0.875:
## :...mths_since_recent_inq <= 4: Charged Off (37.1/12.4)
## : mths_since_recent_inq > 4: Fully Paid (50.6/7.1)
## propSatisBankcardAccts <= 0.875:
## :...verification_status = Verified: Charged Off (262/74.8)
## verification_status in {Not Verified,Source Verified}:
## :...percent_bc_gt_75 <= 25: Fully Paid (54.2/14.1)
## percent_bc_gt_75 > 25:
## :...emp_length in {n/a,3 years,4 years,6 years,
## : 9 years}: Fully Paid (45.9/10.6)
## emp_length in {< 1 year,1 year,2 years,5 years,7 years,8 years,
## : 10+ years}:
## :...mo_sin_rcnt_rev_tl_op <= 6:
## :...sub_grade in {B3,B5,C1,C2,G4}: Charged Off (150.1/33.6)
## : sub_grade = B4: Fully Paid (11.8/3.5)
## mo_sin_rcnt_rev_tl_op > 6:
## :...mo_sin_old_il_acct <= 124: Fully Paid (39.4/10.6)
## mo_sin_old_il_acct > 124: Charged Off (106.6/35.9)
##
## SubTree [S30]
##
## loan_amnt <= 5125: Fully Paid (32.4)
## loan_amnt > 5125:
## :...num_accts_ever_120_pd <= 0: Fully Paid (131.9/31.8)
## num_accts_ever_120_pd > 0:
## :...annual_inc <= 35500: Charged Off (30/5.3)
## annual_inc > 35500: Fully Paid (51.8/17.7)
##
## SubTree [S31]
##
## purpose in {car,medical,moving,vacation}: Fully Paid (5.3)
## purpose in {home_improvement,major_purchase,other,
## : small_business}: Charged Off (41.8/13.5)
## purpose = credit_card:
## :...mths_since_recent_inq <= 3: Charged Off (31.2/10)
## : mths_since_recent_inq > 3: Fully Paid (31.2/10.6)
## purpose = debt_consolidation:
## :...mths_since_recent_inq <= 3: Fully Paid (44.7/17.7)
## mths_since_recent_inq > 3:
## :...int_rate <= 13.05: Charged Off (136/23)
## int_rate > 13.05: Fully Paid (44.2/17.7)
##
## SubTree [S32]
##
## pub_rec > 1: Fully Paid (59.5/17.7)
## pub_rec <= 1:
## :...num_il_tl > 15:
## :...total_acc <= 28: Charged Off (58.9/27.1)
## : total_acc > 28: Fully Paid (42.4/7.1)
## num_il_tl <= 15:
## :...openAccRatio > 0.7068965:
## :...purpose in {car,home_improvement,medical,other,
## : : small_business}: Charged Off (65.9/23.5)
## : purpose in {major_purchase,moving,
## : : vacation}: Fully Paid (13/3.5)
## : purpose = credit_card:
## : :...home_ownership in {ANY,MORTGAGE,NONE,OTHER,
## : : : RENT}: Charged Off (118.9/51.8)
## : : home_ownership = OWN: Fully Paid (8.2)
## : purpose = debt_consolidation:
## : :...tot_hi_cred_lim > 76339: Fully Paid (37.1)
## : tot_hi_cred_lim <= 76339:
## : :...openAccRatio <= 0.7666667: Fully Paid (40.6/7.1)
## : openAccRatio > 0.7666667:
## : :...acc_open_past_24mths > 5: Charged Off (38.3/10)
## : acc_open_past_24mths <= 5:
## : :...num_tl_op_past_12m <= 1: Charged Off (45.3/17.1)
## : num_tl_op_past_12m > 1: Fully Paid (49.5/14.1)
## openAccRatio <= 0.7068965:
## :...annual_inc <= 29925:
## :...int_rate <= 13.66: Charged Off (256.7/65.9)
## : int_rate > 13.66: Fully Paid (34.1/14.1)
## annual_inc > 29925:
## :...num_accts_ever_120_pd > 2:
## :...emp_length in {n/a,2 years,4 years,
## : : 5 years}: Fully Paid (8.2)
## : emp_length in {< 1 year,1 year,3 years,6 years,7 years,
## : 8 years,9 years,
## : 10+ years}: Charged Off (86.5/19.4)
## num_accts_ever_120_pd <= 2:
## :...propSatisBankcardAccts <= 0.368421:
## :...bc_open_to_buy <= 65: Charged Off (32.4/7.7)
## : bc_open_to_buy > 65: Fully Paid (147.2/38.9)
## propSatisBankcardAccts > 0.368421:
## :...verification_status = Verified:
## :...purpose in {car,home_improvement,
## : : major_purchase}: Fully Paid (8.8)
## : purpose in {medical,moving,other,small_business,
## : : vacation}: Charged Off (30/8.8)
## : purpose = credit_card:
## : :...initial_list_status = f: Fully Paid (44.7/14.1)
## : : initial_list_status = w: Charged Off (41.2/5.9)
## : purpose = debt_consolidation:
## : :...mths_since_last_delinq > 64: Charged Off (30/1.8)
## : mths_since_last_delinq <= 64: [S57]
## verification_status in {Not Verified,Source Verified}:
## :...mths_since_recent_inq > 20: Charged Off (43/7.7)
## mths_since_recent_inq <= 20:
## :...num_bc_sats > 3:
## :...pct_tl_nvr_dlq <= 93.8:
## : :...num_il_tl > 6: Charged Off (94.8/24.1)
## : : num_il_tl <= 6: [S58]
## : pct_tl_nvr_dlq > 93.8:
## : :...num_bc_tl <= 5: Fully Paid (32.4/3.5)
## : num_bc_tl > 5: [S59]
## num_bc_sats <= 3: [S60]
##
## SubTree [S33]
##
## mths_since_last_delinq > 50:
## :...loan_amnt <= 6125: Fully Paid (60.6)
## : loan_amnt > 6125:
## : :...total_bc_limit <= 5150: Fully Paid (35.9/3.5)
## : total_bc_limit > 5150:
## : :...mths_since_last_delinq <= 60: Fully Paid (39.4/7.1)
## : mths_since_last_delinq > 60:
## : :...installment <= 337.17: Charged Off (71.8/18.8)
## : installment > 337.17: Fully Paid (55.3/17.7)
## mths_since_last_delinq <= 50:
## :...num_bc_sats > 6:
## :...home_ownership in {ANY,NONE,OTHER}: Charged Off (0)
## : home_ownership = MORTGAGE: Fully Paid (15.3/3.5)
## : home_ownership in {OWN,RENT}:
## : :...total_rev_hi_lim <= 21900: Fully Paid (35.3/14.1)
## : total_rev_hi_lim > 21900: Charged Off (104.8/23.5)
## num_bc_sats <= 6:
## :...revol_util > 86.7:
## :...total_bc_limit > 20520: Fully Paid (42.4/10.6)
## : total_bc_limit <= 20520:
## : :...mo_sin_old_rev_tl_op > 197: Charged Off (130.1/24.1)
## : mo_sin_old_rev_tl_op <= 197:
## : :...num_bc_tl <= 2: Charged Off (65.9/16.5)
## : num_bc_tl > 2:
## : :...inq_last_6mths > 0: Charged Off (85.4/28.8)
## : inq_last_6mths <= 0:
## : :...int_rate <= 12.12: Fully Paid (38.3/7.1)
## : int_rate > 12.12:
## : :...num_rev_accts <= 6: Fully Paid (37.7/10.6)
## : num_rev_accts > 6: Charged Off (111.3/44.2)
## revol_util <= 86.7:
## :...mths_since_recent_bc <= 3: Charged Off (221.4/83.6)
## mths_since_recent_bc > 3:
## :...num_tl_op_past_12m > 1:
## :...mths_since_recent_inq > 17: Charged Off (38.9/7.1)
## : mths_since_recent_inq <= 17:
## : :...num_actv_bc_tl > 4: Charged Off (72.4/19.4)
## : num_actv_bc_tl <= 4:
## : :...total_bc_limit <= 1900: Charged Off (62.4/16.5)
## : total_bc_limit > 1900:
## : :...total_rev_hi_lim > 30400: Charged Off (45.9/14.1)
## : total_rev_hi_lim <= 30400:
## : :...total_rev_hi_lim <= 6550: Charged Off (54.8/19.4)
## : total_rev_hi_lim > 6550:
## : :...tot_hi_cred_lim > 94792: Fully Paid (49.5/3.5)
## : tot_hi_cred_lim <= 94792: [S61]
## num_tl_op_past_12m <= 1:
## :...mo_sin_rcnt_rev_tl_op > 51: Charged Off (38.9/10.6)
## mo_sin_rcnt_rev_tl_op <= 51:
## :...pct_tl_nvr_dlq <= 84.1: Fully Paid (153.1/35.3)
## pct_tl_nvr_dlq > 84.1:
## :...pub_rec > 1: Charged Off (46.5/18.3)
## pub_rec <= 1:
## :...revol_util <= 33.9: Fully Paid (97.7/14.1)
## revol_util > 33.9:
## :...delinq_2yrs > 0: Charged Off (171.3/79.5)
## delinq_2yrs <= 0:
## :...emp_length in {n/a,< 1 year,1 year,
## : 3 years,5 years,9 years}: [S62]
## emp_length in {2 years,4 years,6 years,
## : 7 years,8 years,
## : 10+ years}: [S63]
##
## SubTree [S34]
##
## num_actv_bc_tl <= 3: Fully Paid (38.9/14.1)
## num_actv_bc_tl > 3: Charged Off (35.9/11.2)
##
## SubTree [S35]
##
## sub_grade in {A1,C2,C3,C5}: Fully Paid (1136.3/148.4)
## sub_grade in {A2,A3}:
## :...mo_sin_rcnt_tl > 27: Fully Paid (97.7/3.5)
## mo_sin_rcnt_tl <= 27:
## :...mo_sin_old_rev_tl_op > 369: Fully Paid (73/3.5)
## mo_sin_old_rev_tl_op <= 369:
## :...num_actv_rev_tl > 7:
## :...emp_length in {n/a,< 1 year,1 year,2 years,3 years,5 years,
## : : 6 years,7 years,
## : : 9 years}: Fully Paid (166/45.9)
## : emp_length in {4 years,8 years}: Charged Off (54.8/26.5)
## : emp_length = 10+ years:
## : :...mo_sin_rcnt_tl <= 1: Charged Off (31.2/10)
## : mo_sin_rcnt_tl > 1:
## : :...open_acc <= 17: Fully Paid (65.9/3.5)
## : open_acc > 17:
## : :...propSatisBankcardAccts <= 0.6896552: Fully Paid (34.1/7.1)
## : propSatisBankcardAccts > 0.6896552: Charged Off (38.9/10.6)
## num_actv_rev_tl <= 7:
## :...acc_open_past_24mths > 8: Fully Paid (49.5)
## acc_open_past_24mths <= 8:
## :...delinq_2yrs > 1:
## :...propSatisBankcardAccts <= 0.4583333: Charged Off (30/12.4)
## : propSatisBankcardAccts > 0.4583333: Fully Paid (31.8/7.1)
## delinq_2yrs <= 1:
## :...bc_open_to_buy > 62460: Fully Paid (39.4)
## bc_open_to_buy <= 62460:
## :...total_bc_limit <= 4150: Fully Paid (42.4)
## total_bc_limit > 4150: [S64]
##
## SubTree [S36]
##
## purpose in {car,house,moving,other,vacation}: Charged Off (14.7/4.1)
## purpose in {credit_card,home_improvement,major_purchase,medical,
## : small_business}: Fully Paid (34.7/14.1)
## purpose = debt_consolidation:
## :...total_rev_hi_lim <= 12700: Charged Off (46.5/11.2)
## total_rev_hi_lim > 12700: Fully Paid (32.4/10.6)
##
## SubTree [S37]
##
## borrHistory <= 9.330595: Charged Off (30/12.4)
## borrHistory > 9.330595: Fully Paid (49.5)
##
## SubTree [S38]
##
## num_rev_accts > 14: Charged Off (65.3/15.9)
## num_rev_accts <= 14:
## :...num_actv_bc_tl <= 1: Charged Off (35.3/10.6)
## num_actv_bc_tl > 1: Fully Paid (183.7/77.7)
##
## SubTree [S39]
##
## purpose in {car,house,major_purchase,medical}: Fully Paid (7.7)
## purpose in {home_improvement,moving,other,small_business,
## : vacation}: Charged Off (15.3/1.2)
## purpose = credit_card:
## :...num_rev_accts <= 15: Fully Paid (53.6/21.2)
## : num_rev_accts > 15: Charged Off (35.9/4.1)
## purpose = debt_consolidation:
## :...verification_status in {Not Verified,Verified}: Charged Off (118.9/34.1)
## verification_status = Source Verified: Fully Paid (32.4/14.1)
##
## SubTree [S40]
##
## total_bc_limit > 14315: Fully Paid (88.9/14.1)
## total_bc_limit <= 14315:
## :...purpose in {car,credit_card,home_improvement,house,major_purchase,other,
## : small_business,vacation}: Fully Paid (61.8/24.7)
## purpose in {medical,moving}: Charged Off (7.7/0.6)
## purpose = debt_consolidation:
## :...mo_sin_rcnt_rev_tl_op <= 8: Fully Paid (35.9/7.1)
## mo_sin_rcnt_rev_tl_op > 8: Charged Off (93.6/37.1)
##
## SubTree [S41]
##
## revol_util <= 29.8: Charged Off (30/12.4)
## revol_util > 29.8: Fully Paid (31.2)
##
## SubTree [S42]
##
## percent_bc_gt_75 <= 5: Charged Off (33.6/12.4)
## percent_bc_gt_75 > 5: Fully Paid (50.6/3.5)
##
## SubTree [S43]
##
## loan_amnt > 6900: Charged Off (500.4/179)
## loan_amnt <= 6900:
## :...openAccRatio <= 0.3731343: Fully Paid (31.8)
## openAccRatio > 0.3731343: Charged Off (94.2/44.7)
##
## SubTree [S44]
##
## total_bc_limit > 86900: Charged Off (30/5.3)
## total_bc_limit <= 86900:
## :...installment > 798.6: Fully Paid (60.1/10.6)
## installment <= 798.6:
## :...num_bc_tl > 11:
## :...mths_since_recent_bc > 22: Fully Paid (30.6/7.1)
## : mths_since_recent_bc <= 22:
## : :...num_bc_sats <= 5: Charged Off (134.8/32.4)
## : num_bc_sats > 5:
## : :...tot_cur_bal <= 22098: Fully Paid (33/3.5)
## : tot_cur_bal > 22098: Charged Off (191.9/78.9)
## num_bc_tl <= 11:
## :...purpose in {car,home_improvement,house,medical,other,
## : vacation}: Fully Paid (93/31.8)
## purpose in {major_purchase,moving,
## : small_business}: Charged Off (51.8/20)
## purpose = debt_consolidation:
## :...total_il_high_credit_limit > 64830:
## : :...num_il_tl <= 14: Fully Paid (43)
## : : num_il_tl > 14:
## : : :...tot_hi_cred_lim <= 144997: Fully Paid (33/3.5)
## : : tot_hi_cred_lim > 144997: Charged Off (30/12.4)
## : total_il_high_credit_limit <= 64830:
## : :...num_actv_rev_tl > 7:
## : :...mths_since_recent_bc <= 13: Charged Off (107.1/33)
## : : mths_since_recent_bc > 13: Fully Paid (31.8/10.6)
## : num_actv_rev_tl <= 7:
## : :...emp_length in {n/a,4 years}: Charged Off (90.1/33.6)
## : emp_length in {< 1 year,3 years,5 years,6 years,
## : : 7 years,8 years,
## : : 9 years}: Fully Paid (252.6/88.3)
## : emp_length = 1 year:
## : :...dti <= 22.09: Fully Paid (38.3/14.1)
## : : dti > 22.09: Charged Off (34.1/5.9)
## : emp_length = 2 years:
## : :...mths_since_recent_bc <= 11: Fully Paid (33/7.1)
## : : mths_since_recent_bc > 11: Charged Off (57.1/21.8)
## : emp_length = 10+ years:
## : :...initial_list_status = f: Fully Paid (54.8/14.1)
## : initial_list_status = w:
## : :...total_bal_ex_mort <= 37509: Fully Paid (51.8/17.7)
## : total_bal_ex_mort > 37509: Charged Off (38.3/6.5)
## purpose = credit_card:
## :...loan_amnt <= 4900: Fully Paid (31.2/3.5)
## loan_amnt > 4900:
## :...revol_bal <= 5090: Charged Off (74.2/17.7)
## revol_bal > 5090:
## :...borrHistory > 14.08077:
## :...total_rev_hi_lim <= 17200: Fully Paid (31.2/7.1)
## : total_rev_hi_lim > 17200:
## : :...num_op_rev_tl <= 9: Charged Off (132.5/44.2)
## : num_op_rev_tl > 9: Fully Paid (31.8/10.6)
## borrHistory <= 14.08077:
## :...mo_sin_old_rev_tl_op > 139: Fully Paid (34.7)
## mo_sin_old_rev_tl_op <= 139:
## :...sub_grade in {A4,A5,B1}: Fully Paid (44.7/3.5)
## sub_grade in {B2,B3,B4,B5}:
## :...revol_util > 69: Fully Paid (34.1/3.5)
## revol_util <= 69:
## :...revol_util <= 51.6: Fully Paid (60.1/21.2)
## revol_util > 51.6: Charged Off (74.8/21.8)
##
## SubTree [S45]
##
## bc_open_to_buy > 6876: Fully Paid (33/3.5)
## bc_open_to_buy <= 6876:
## :...emp_length in {n/a,1 year,2 years,4 years,7 years,8 years,
## : 9 years}: Fully Paid (9.4)
## emp_length in {< 1 year,3 years,5 years,6 years,
## 10+ years}: Charged Off (71.8/18.8)
##
## SubTree [S46]
##
## tot_hi_cred_lim > 210973: Fully Paid (53/10.6)
## tot_hi_cred_lim <= 210973:
## :...borrHistory <= 18.16564: Charged Off (53.6/7.7)
## borrHistory > 18.16564: Fully Paid (39.4/17.7)
##
## SubTree [S47]
##
## tax_liens > 0: Fully Paid (116.6/28.3)
## tax_liens <= 0:
## :...revol_util > 98.1: Charged Off (46.5/14.7)
## revol_util <= 98.1:
## :...num_rev_accts <= 7:
## :...num_bc_sats <= 1: Fully Paid (31.2)
## : num_bc_sats > 1:
## : :...num_op_rev_tl <= 3: Fully Paid (63.6/3.5)
## : num_op_rev_tl > 3:
## : :...num_op_rev_tl > 4: Fully Paid (124.2/17.7)
## : num_op_rev_tl <= 4:
## : :...percent_bc_gt_75 <= 58.3: Fully Paid (52.4/14.1)
## : percent_bc_gt_75 > 58.3: Charged Off (82.4/33)
## num_rev_accts > 7:
## :...mort_acc <= 0:
## :...purpose in {car,house,major_purchase,medical,moving,other,
## : : small_business}: Fully Paid (31.8)
## : purpose in {credit_card,debt_consolidation,home_improvement,
## : : vacation}:
## : :...pub_rec > 0:
## : :...num_il_tl <= 7: Charged Off (100.7/30)
## : : num_il_tl > 7: Fully Paid (34.1/10.6)
## : pub_rec <= 0:
## : :...num_rev_tl_bal_gt_0 <= 2: Charged Off (44.7/13)
## : num_rev_tl_bal_gt_0 > 2:
## : :...mths_since_recent_bc <= 6: Fully Paid (105.4/21.2)
## : mths_since_recent_bc > 6:
## : :...propSatisBankcardAccts <= 0.4285714: Fully Paid (80.1/21.2)
## : propSatisBankcardAccts > 0.4285714: [S65]
## mort_acc > 0:
## :...pub_rec > 1: Fully Paid (40/3.5)
## pub_rec <= 1:
## :...revol_util > 93.6: Fully Paid (47.1/3.5)
## revol_util <= 93.6:
## :...mths_since_recent_bc > 113: Fully Paid (57.1/7.1)
## mths_since_recent_bc <= 113:
## :...propSatisBankcardAccts <= 0.2222222: [S66]
## propSatisBankcardAccts > 0.2222222:
## :...num_accts_ever_120_pd > 3: Fully Paid (44.7/3.5)
## num_accts_ever_120_pd <= 3:
## :...num_il_tl > 21: Fully Paid (147.8/28.3)
## num_il_tl <= 21:
## :...acc_open_past_24mths > 7: [S67]
## acc_open_past_24mths <= 7: [S68]
##
## SubTree [S48]
##
## mths_since_recent_bc > 39: Fully Paid (114.8/17.7)
## mths_since_recent_bc <= 39:
## :...pub_rec_bankruptcies > 0: Charged Off (53.6/18.3)
## pub_rec_bankruptcies <= 0:
## :...borrHistory <= 10.67214: Fully Paid (64.2/7.1)
## borrHistory > 10.67214:
## :...total_acc <= 19: Fully Paid (47.7/7.1)
## total_acc > 19:
## :...mort_acc > 4: Fully Paid (35.9/7.1)
## mort_acc <= 4:
## :...inq_last_6mths > 1: Charged Off (67.7/21.8)
## inq_last_6mths <= 1:
## :...emp_length in {n/a,< 1 year,3 years,4 years,5 years,
## : 8 years}: Charged Off (171.9/76.5)
## emp_length in {1 year,2 years,6 years,7 years,
## : 9 years}: Fully Paid (99.5/28.3)
## emp_length = 10+ years:
## :...home_ownership in {ANY,NONE,OTHER,
## : OWN}: Fully Paid (12.4)
## home_ownership = MORTGAGE:
## :...tot_cur_bal <= 211979: Fully Paid (33/3.5)
## : tot_cur_bal > 211979: Charged Off (44.7/20)
## home_ownership = RENT:
## :...installment <= 488.14: Fully Paid (30.6/10.6)
## installment > 488.14: Charged Off (35.9/7.7)
##
## SubTree [S49]
##
## percent_bc_gt_75 <= 21.1: Charged Off (37.1/12.4)
## percent_bc_gt_75 > 21.1: Fully Paid (43.6/14.1)
##
## SubTree [S50]
##
## emp_length in {n/a,< 1 year,2 years,4 years,5 years,6 years,8 years,
## : 9 years}: Charged Off (156/71.2)
## emp_length in {1 year,3 years,7 years,10+ years}: Fully Paid (56.5/10.6)
##
## SubTree [S51]
##
## avg_cur_bal <= 590: Fully Paid (37.1)
## avg_cur_bal > 590:
## :...revol_bal <= 2172:
## :...num_accts_ever_120_pd > 0: Fully Paid (30.6/3.5)
## : num_accts_ever_120_pd <= 0:
## : :...emp_length in {2 years,3 years,4 years,
## : : 5 years}: Fully Paid (24.1/3.5)
## : emp_length in {n/a,< 1 year,1 year,6 years,7 years,8 years,9 years,
## : : 10+ years}:
## : :...propSatisBankcardAccts <= 0.5833333: Fully Paid (37.1/14.1)
## : propSatisBankcardAccts > 0.5833333: Charged Off (69.5/13)
## revol_bal > 2172:
## :...num_bc_tl > 11:
## :...sub_grade in {B1,B2,B3,B4,B5}: Fully Paid (0)
## : sub_grade = A4:
## : :...emp_length in {n/a,< 1 year,4 years,7 years,
## : : : 9 years}: Fully Paid (71.2)
## : : emp_length in {1 year,2 years,3 years,5 years,6 years,8 years,
## : : : 10+ years}:
## : : :...total_rev_hi_lim <= 34344:
## : : :...total_bal_ex_mort <= 24559: Fully Paid (37.7/10.6)
## : : : total_bal_ex_mort > 24559: Charged Off (73/23.5)
## : : total_rev_hi_lim > 34344:
## : : :...mths_since_recent_inq > 9: Fully Paid (47.7)
## : : mths_since_recent_inq <= 9:
## : : :...total_rev_hi_lim <= 52217: Fully Paid (36.5)
## : : total_rev_hi_lim > 52217:
## : : :...emp_length in {1 year,2 years,5 years,
## : : : 8 years}: Charged Off (50/18.3)
## : : emp_length in {3 years,6 years,
## : : 10+ years}: Fully Paid (61.8/10.6)
## : sub_grade = A5:
## : :...pub_rec_bankruptcies > 0: Fully Paid (41.8/10.6)
## : pub_rec_bankruptcies <= 0:
## : :...percent_bc_gt_75 <= 7.1: Fully Paid (71.8/17.7)
## : percent_bc_gt_75 > 7.1:
## : :...emp_length in {n/a,1 year,5 years,6 years,
## : : 7 years}: Charged Off (128.3/47.1)
## : emp_length in {< 1 year,2 years,3 years,4 years,
## : : 8 years,
## : : 9 years}: Fully Paid (106.6/35.3)
## : emp_length = 10+ years:
## : :...mths_since_recent_inq > 7: Fully Paid (35.9/7.1)
## : mths_since_recent_inq <= 7: [S69]
## num_bc_tl <= 11:
## :...bc_open_to_buy > 42288: Fully Paid (61.8)
## bc_open_to_buy <= 42288:
## :...revol_util <= 11.1: Charged Off (54.2/18.8)
## revol_util > 11.1:
## :...tot_hi_cred_lim > 556659: Fully Paid (51.8)
## tot_hi_cred_lim <= 556659:
## :...dti <= 3.89: Fully Paid (43)
## dti > 3.89:
## :...bc_util <= 22.6: Fully Paid (110.1/7.1)
## bc_util > 22.6:
## :...num_actv_bc_tl > 6:
## :...dti <= 17.02: Fully Paid (31.8/7.1)
## : dti > 17.02: Charged Off (37.7/13)
## num_actv_bc_tl <= 6:
## :...purpose = other: Charged Off (48.3/23.5)
## purpose in {credit_card,debt_consolidation,
## : home_improvement,house,
## : major_purchase,medical}:
## :...borrHistory <= 7.838467: Fully Paid (44.7)
## borrHistory > 7.838467:
## :...acc_open_past_24mths > 7: [S70]
## acc_open_past_24mths <= 7: [S71]
##
## SubTree [S52]
##
## purpose in {car,home_improvement,house,major_purchase,moving,
## : vacation}: Charged Off (29.4/8.2)
## purpose in {credit_card,medical,other,small_business}: Fully Paid (50/21.2)
## purpose = debt_consolidation:
## :...emp_length in {n/a,1 year,3 years,5 years,9 years}: Fully Paid (13)
## emp_length in {< 1 year,2 years,4 years,6 years,7 years,8 years,10+ years}:
## :...openAccRatio <= 0.3934426: Charged Off (39.4/4.1)
## openAccRatio > 0.3934426:
## :...grade in {C,D,F}: Fully Paid (55.3/24.7)
## grade in {A,B,E,G}: Charged Off (7.1)
##
## SubTree [S53]
##
## verification_status in {Not Verified,Source Verified}: Charged Off (63.6/28.3)
## verification_status = Verified: Fully Paid (15.3)
##
## SubTree [S54]
##
## verification_status in {Not Verified,Source Verified}: Fully Paid (51.2/17.7)
## verification_status = Verified: Charged Off (46.5/14.7)
##
## SubTree [S55]
##
## purpose in {car,moving,vacation}: Fully Paid (2.4)
## purpose in {credit_card,home_improvement,house,major_purchase,medical,other,
## : small_business}: Charged Off (118.3/26.5)
## purpose = debt_consolidation:
## :...dti > 19.51: Charged Off (105.4/24.1)
## dti <= 19.51:
## :...installment <= 413.68: Fully Paid (39.4/10.6)
## installment > 413.68:
## :...mths_since_recent_bc <= 4: Charged Off (44.7/9.4)
## mths_since_recent_bc > 4: Fully Paid (33/14.1)
##
## SubTree [S56]
##
## home_ownership in {ANY,NONE,OTHER,OWN}: Fully Paid (37.1/7.1)
## home_ownership = RENT: Charged Off (52.4/24.1)
## home_ownership = MORTGAGE:
## :...percent_bc_gt_75 <= 23.5: Fully Paid (83.6/14.1)
## percent_bc_gt_75 > 23.5:
## :...num_accts_ever_120_pd > 0:
## :...purpose in {credit_card,other,vacation}: Fully Paid (28.8)
## : purpose in {debt_consolidation,home_improvement,major_purchase}:
## : :...num_actv_bc_tl <= 3: Fully Paid (103.6/49.5)
## : num_actv_bc_tl > 3: Charged Off (91.3/24.1)
## num_accts_ever_120_pd <= 0:
## :...emp_length in {2 years,4 years}: Fully Paid (41.2)
## emp_length in {n/a,< 1 year,1 year,3 years,5 years,6 years,7 years,
## : 8 years,9 years,10+ years}:
## :...total_rev_hi_lim <= 12650: Fully Paid (41.2/3.5)
## total_rev_hi_lim > 12650:
## :...total_acc <= 16:
## :...mths_since_recent_bc <= 14: Fully Paid (30.6/10.6)
## : mths_since_recent_bc > 14: Charged Off (55.3/9.4)
## total_acc > 16:
## :...mths_since_recent_bc > 48: Fully Paid (42.4/3.5)
## mths_since_recent_bc <= 48:
## :...int_rate <= 11.55: Fully Paid (33.6/3.5)
## int_rate > 11.55:
## :...mo_sin_rcnt_rev_tl_op > 17: Fully Paid (50.6/10.6)
## mo_sin_rcnt_rev_tl_op <= 17:
## :...num_bc_tl > 13: Charged Off (32.4/7.7)
## num_bc_tl <= 13:
## :...borrHistory > 14.59001: Fully Paid (122.5/38.9)
## borrHistory <= 14.59001: [S72]
##
## SubTree [S57]
##
## mths_since_recent_bc <= 18: Charged Off (177.2/53.6)
## mths_since_recent_bc > 18: Fully Paid (37.7/14.1)
##
## SubTree [S58]
##
## emp_length in {n/a,< 1 year,2 years,4 years,5 years,
## : 6 years}: Fully Paid (22.4)
## emp_length in {1 year,3 years,7 years,8 years,9 years,10+ years}:
## :...grade = B: Fully Paid (25.9/10.6)
## grade in {A,C,D,E,F,G}: Charged Off (55.9/17.1)
##
## SubTree [S59]
##
## home_ownership in {ANY,NONE,OTHER,OWN}: Fully Paid (21.8)
## home_ownership = MORTGAGE:
## :...mths_since_recent_bc <= 12: Fully Paid (51.2/17.7)
## : mths_since_recent_bc > 12:
## : :...purpose in {car,credit_card,debt_consolidation,moving,small_business,
## : : vacation}: Charged Off (57.1/14.7)
## : purpose in {home_improvement,major_purchase,medical,
## : other}: Fully Paid (5.3)
## home_ownership = RENT:
## :...total_acc <= 13: Charged Off (50.6/15.3)
## total_acc > 13:
## :...mths_since_recent_inq <= 3: Charged Off (54.8/19.4)
## mths_since_recent_inq > 3: Fully Paid (98.9/24.7)
##
## SubTree [S60]
##
## purpose in {car,major_purchase,small_business}: Fully Paid (11.2/3.5)
## purpose in {home_improvement,medical,moving,other,
## : vacation}: Charged Off (68.3/15.3)
## purpose = credit_card:
## :...home_ownership in {MORTGAGE,OWN}: Fully Paid (23.5/7.1)
## : home_ownership in {ANY,NONE,OTHER,RENT}: Charged Off (65.3/19.4)
## purpose = debt_consolidation:
## :...mo_sin_rcnt_tl <= 1: Charged Off (57.7/11.8)
## mo_sin_rcnt_tl > 1:
## :...mo_sin_old_il_acct > 138: Fully Paid (37.1/3.5)
## mo_sin_old_il_acct <= 138:
## :...num_bc_tl > 6: Charged Off (50.6/11.8)
## num_bc_tl <= 6:
## :...total_acc > 17: Fully Paid (37.7/10.6)
## total_acc <= 17:
## :...num_actv_bc_tl <= 2: Fully Paid (58.9/28.3)
## num_actv_bc_tl > 2: Charged Off (41.2/9.4)
##
## SubTree [S61]
##
## purpose in {car,small_business}: Charged Off (14.1/3.5)
## purpose in {home_improvement,major_purchase,medical,moving,other,
## : vacation}: Fully Paid (27.1/3.5)
## purpose = credit_card:
## :...num_actv_rev_tl <= 4: Charged Off (38.9/14.1)
## : num_actv_rev_tl > 4: Fully Paid (42.4/7.1)
## purpose = debt_consolidation:
## :...tot_hi_cred_lim <= 37868: Fully Paid (39.4)
## tot_hi_cred_lim > 37868:
## :...num_il_tl <= 3: Charged Off (32.4/7.7)
## num_il_tl > 3:
## :...total_acc <= 23: Fully Paid (66.5/14.1)
## total_acc > 23: Charged Off (35.3/10.6)
##
## SubTree [S62]
##
## purpose in {car,home_improvement,moving,
## : small_business}: Charged Off (25.9/8.2)
## purpose in {major_purchase,medical,other,vacation}: Fully Paid (13)
## purpose = credit_card:
## :...verification_status = Not Verified: Fully Paid (51.8/21.2)
## : verification_status in {Source Verified,Verified}: Charged Off (88.9/35.9)
## purpose = debt_consolidation:
## :...home_ownership in {ANY,MORTGAGE,NONE,OTHER,OWN}: Fully Paid (50.6/10.6)
## home_ownership = RENT:
## :...installment > 342.38: Charged Off (93.6/26.5)
## installment <= 342.38:
## :...bc_util <= 95: Fully Paid (70.6/17.7)
## bc_util > 95: Charged Off (30/8.8)
##
## SubTree [S63]
##
## total_il_high_credit_limit > 42255: Fully Paid (31.2)
## total_il_high_credit_limit <= 42255:
## :...openAccRatio > 0.8809524: Charged Off (31.8/10.6)
## openAccRatio <= 0.8809524:
## :...annual_inc <= 27400: Charged Off (41.8/17.1)
## annual_inc > 27400:
## :...mo_sin_old_il_acct > 135: Fully Paid (64.2)
## mo_sin_old_il_acct <= 135:
## :...tot_coll_amt > 62: Charged Off (30/12.4)
## tot_coll_amt <= 62:
## :...revol_util > 75.7: Fully Paid (45.9)
## revol_util <= 75.7:
## :...mo_sin_old_rev_tl_op <= 125: Fully Paid (68.3/7.1)
## mo_sin_old_rev_tl_op > 125:
## :...open_acc <= 6: Charged Off (45.9/14.1)
## open_acc > 6: Fully Paid (44.7/14.1)
##
## SubTree [S64]
##
## purpose in {car,home_improvement,major_purchase,medical,moving,other,
## : small_business}: Fully Paid (232/53)
## purpose in {house,vacation}: Charged Off (18.3/4.1)
## purpose = debt_consolidation:
## :...num_op_rev_tl <= 4: Fully Paid (104.2/3.5)
## : num_op_rev_tl > 4:
## : :...num_bc_tl <= 4: Fully Paid (40.6)
## : num_bc_tl > 4:
## : :...total_acc <= 12: Charged Off (30.6/13)
## : total_acc > 12:
## : :...num_actv_rev_tl <= 2: Fully Paid (37.1)
## : num_actv_rev_tl > 2:
## : :...total_bal_ex_mort > 12632: Fully Paid (496.9/88.3)
## : total_bal_ex_mort <= 12632:
## : :...home_ownership in {ANY,MORTGAGE,NONE,OTHER,
## : : OWN}: Charged Off (52.4/20.6)
## : home_ownership = RENT: Fully Paid (9.4)
## purpose = credit_card:
## :...acc_open_past_24mths > 5:
## :...home_ownership in {ANY,NONE,OTHER,OWN}: Fully Paid (5.3)
## : home_ownership = RENT: Charged Off (39.4/11.2)
## : home_ownership = MORTGAGE:
## : :...borrHistory <= 16.67077: Fully Paid (33/3.5)
## : borrHistory > 16.67077: Charged Off (36.5/15.3)
## acc_open_past_24mths <= 5:
## :...annual_inc > 118224: Fully Paid (67.1)
## annual_inc <= 118224:
## :...installment <= 244.5: Fully Paid (81.8/3.5)
## installment > 244.5:
## :...propSatisBankcardAccts <= 0.2580645: Charged Off (34.1/13)
## propSatisBankcardAccts > 0.2580645:
## :...mort_acc > 1: Fully Paid (132.5/17.7)
## mort_acc <= 1:
## :...percent_bc_gt_75 > 47.1: Fully Paid (58.9/7.1)
## percent_bc_gt_75 <= 47.1:
## :...num_rev_accts > 15: Charged Off (58.9/20)
## num_rev_accts <= 15:
## :...dti <= 19.08: Fully Paid (46.5/3.5)
## dti > 19.08: Charged Off (37.7/16.5)
##
## SubTree [S65]
##
## verification_status = Verified: Fully Paid (54.2/17.7)
## verification_status in {Not Verified,Source Verified}:
## :...acc_open_past_24mths > 4: Charged Off (101.3/27.1)
## acc_open_past_24mths <= 4:
## :...sub_grade in {A4,A5}: Fully Paid (0)
## sub_grade in {B1,B2}:
## :...mo_sin_old_il_acct <= 153: Fully Paid (53.6/7.1)
## : mo_sin_old_il_acct > 153: Charged Off (31.8/10.6)
## sub_grade in {B3,B4,B5}:
## :...open_acc <= 10: Fully Paid (31.2/14.1)
## open_acc > 10: Charged Off (45.3/10)
##
## SubTree [S66]
##
## purpose in {car,home_improvement,house,moving,small_business,
## : vacation}: Charged Off (30.6/13)
## purpose in {credit_card,major_purchase,medical,other}: Fully Paid (49.5/14.1)
## purpose = debt_consolidation:
## :...emp_length in {n/a,< 1 year,1 year,2 years,
## : 8 years}: Charged Off (55.3/13)
## emp_length in {3 years,4 years,5 years,6 years,7 years,
## : 9 years}: Fully Paid (13.5)
## emp_length = 10+ years:
## :...openAccRatio <= 0.2708333: Charged Off (38.3/10)
## openAccRatio > 0.2708333: Fully Paid (30.6/10.6)
##
## SubTree [S67]
##
## mths_since_last_delinq > 32: Fully Paid (84.8/14.1)
## mths_since_last_delinq <= 32:
## :...borrHistory <= 14.00137:
## :...installment <= 308.23: Fully Paid (40/14.1)
## : installment > 308.23: Charged Off (126.6/34.7)
## borrHistory > 14.00137:
## :...home_ownership = MORTGAGE: Fully Paid (167.8/63.6)
## home_ownership = OWN: Charged Off (25.3/7.7)
##
## SubTree [S68]
##
## mo_sin_old_il_acct <= 56: Fully Paid (107.7/17.7)
## mo_sin_old_il_acct > 56:
## :...num_il_tl > 6:
## :...num_tl_90g_dpd_24m > 0: Fully Paid (43)
## : num_tl_90g_dpd_24m <= 0:
## : :...emp_length in {< 1 year,1 year,3 years,4 years,5 years,6 years,
## : : 7 years,9 years}: Fully Paid (452.7/120.1)
## : emp_length = n/a:
## : :...sub_grade in {A4,A5,B1,B3,B4}: Charged Off (54.2/22.4)
## : : sub_grade in {B2,B5}: Fully Paid (10)
## : emp_length = 2 years:
## : :...dti <= 21.38: Fully Paid (42.4/7.1)
## : : dti > 21.38: Charged Off (39.4/11.2)
## : emp_length = 8 years:
## : :...num_op_rev_tl <= 9: Fully Paid (33.6/3.5)
## : : num_op_rev_tl > 9: Charged Off (34.1/13)
## : emp_length = 10+ years:
## : :...bc_open_to_buy <= 591: Fully Paid (30)
## : bc_open_to_buy > 591:
## : :...tot_hi_cred_lim <= 145374: Fully Paid (60.1/3.5)
## : tot_hi_cred_lim > 145374:
## : :...int_rate <= 8.9: Fully Paid (48.9/7.1)
## : int_rate > 8.9:
## : :...delinq_2yrs > 0: Charged Off (96/43)
## : delinq_2yrs <= 0:
## : :...mo_sin_rcnt_tl > 5: Fully Paid (120.7/35.3)
## : mo_sin_rcnt_tl <= 5:
## : :...total_acc <= 27: Charged Off (44.7/16.5)
## : total_acc > 27: Fully Paid (75.4/24.7)
## num_il_tl <= 6:
## :...mo_sin_old_rev_tl_op <= 98: Charged Off (96.6/36.5)
## mo_sin_old_rev_tl_op > 98:
## :...delinq_2yrs > 1:
## :...num_rev_accts > 19: Fully Paid (30.6/7.1)
## : num_rev_accts <= 19:
## : :...num_il_tl <= 4: Fully Paid (40/17.7)
## : num_il_tl > 4: Charged Off (54.8/12.4)
## delinq_2yrs <= 1:
## :...mths_since_last_delinq <= 9: Fully Paid (34.1)
## mths_since_last_delinq > 9:
## :...mo_sin_rcnt_tl > 15: Fully Paid (134.8/24.7)
## mo_sin_rcnt_tl <= 15:
## :...int_rate <= 8.9: Fully Paid (113/28.3)
## int_rate > 8.9:
## :...emp_length in {2 years,3 years,
## : 7 years}: Fully Paid (129.5/24.7)
## emp_length in {n/a,< 1 year,1 year,4 years,5 years,
## : 6 years,8 years,9 years,10+ years}: [S73]
##
## SubTree [S69]
##
## purpose = home_improvement: Fully Paid (1.2)
## purpose in {house,major_purchase,medical,other}: Charged Off (7.1)
## purpose in {credit_card,debt_consolidation}:
## :...mort_acc <= 2: Fully Paid (31.2/10.6)
## mort_acc > 2: Charged Off (62.4/20)
##
## SubTree [S70]
##
## revol_util <= 51.9: Fully Paid (45.9/7.1)
## revol_util > 51.9: Charged Off (50/18.3)
##
## SubTree [S71]
##
## acc_open_past_24mths > 6: Fully Paid (53/7.1)
## acc_open_past_24mths <= 6:
## :...mo_sin_old_rev_tl_op > 422: Charged Off (54.8/26.5)
## mo_sin_old_rev_tl_op <= 422:
## :...mo_sin_old_rev_tl_op > 370: Fully Paid (33.6)
## mo_sin_old_rev_tl_op <= 370:
## :...borrHistory > 30.42026: Charged Off (35.9/14.7)
## borrHistory <= 30.42026:
## :...mths_since_recent_inq <= 1:
## :...mths_since_last_delinq > 31: Fully Paid (33/3.5)
## : mths_since_last_delinq <= 31: [S74]
## mths_since_recent_inq > 1:
## :...mths_since_last_delinq > 51:
## :...purpose in {credit_card,house,major_purchase,
## : : medical}: Fully Paid (51.2/10.6)
## : purpose = home_improvement: Charged Off (11.2/4.1)
## : purpose = debt_consolidation:
## : :...mort_acc > 2: Fully Paid (30/3.5)
## : mort_acc <= 2:
## : :...mths_since_recent_inq <= 6: Fully Paid (33/10.6)
## : mths_since_recent_inq > 6: Charged Off (52.4/17.1)
## mths_since_last_delinq <= 51:
## :...num_rev_accts <= 5:
## :...total_rev_hi_lim <= 9200: Charged Off (30/8.8)
## : total_rev_hi_lim > 9200: Fully Paid (61.8/10.6)
## num_rev_accts > 5:
## :...mo_sin_old_il_acct > 193: Fully Paid (69.5)
## mo_sin_old_il_acct <= 193:
## :...num_bc_tl <= 4: Fully Paid (146/10.6)
## num_bc_tl > 4:
## :...num_tl_op_past_12m <= 0: Fully Paid (212.5/28.3)
## num_tl_op_past_12m > 0:
## :...borrHistory > 26.42026: Fully Paid (35.3)
## borrHistory <= 26.42026:
## :...openAccRatio > 0.6521739: Fully Paid (57.1/3.5)
## openAccRatio <= 0.6521739:
## :...emp_length in {n/a,2 years,
## : 4 years}: [S75]
## emp_length in {< 1 year,1 year,
## : 3 years,5 years,
## : 6 years,7 years,
## : 8 years,9 years,
## : 10+ years}: [S76]
##
## SubTree [S72]
##
## sub_grade in {B3,C2,G4}: Charged Off (53/10.6)
## sub_grade in {B4,B5,C1}: Fully Paid (60.6/28.3)
##
## SubTree [S73]
##
## purpose in {car,other}: Charged Off (37.1/12.4)
## purpose in {home_improvement,house,major_purchase,medical,moving,
## : small_business,vacation}: Fully Paid (65.3/24.7)
## purpose = credit_card:
## :...bc_open_to_buy > 3747:
## : :...mo_sin_rcnt_tl <= 2: Charged Off (31.8/14.1)
## : : mo_sin_rcnt_tl > 2: Fully Paid (83.6/7.1)
## : bc_open_to_buy <= 3747:
## : :...emp_length in {< 1 year,5 years,8 years}: Fully Paid (10)
## : emp_length in {n/a,1 year,4 years,6 years,9 years,10+ years}:
## : :...total_il_high_credit_limit <= 5125: Charged Off (34.7/6.5)
## : total_il_high_credit_limit > 5125: Fully Paid (39.4/17.7)
## purpose = debt_consolidation:
## :...inq_last_6mths > 1: Charged Off (63.6/21.2)
## inq_last_6mths <= 1:
## :...num_il_tl > 5:
## :...propSatisBankcardAccts <= 0.5625: Charged Off (54.8/12.4)
## : propSatisBankcardAccts > 0.5625:
## : :...open_acc <= 10: Charged Off (35.9/14.7)
## : open_acc > 10: Fully Paid (38.9/10.6)
## num_il_tl <= 5:
## :...num_bc_sats <= 3:
## :...num_il_tl <= 3: Charged Off (45.3/13.5)
## : num_il_tl > 3: Fully Paid (44.2/17.7)
## num_bc_sats > 3:
## :...tot_hi_cred_lim <= 179650: Fully Paid (43)
## tot_hi_cred_lim > 179650:
## :...acc_open_past_24mths <= 3: Fully Paid (50.6/10.6)
## acc_open_past_24mths > 3: Charged Off (77.1/34.7)
##
## SubTree [S74]
##
## verification_status in {Not Verified,Verified}: Fully Paid (66.5/17.7)
## verification_status = Source Verified: Charged Off (71.2/28.8)
##
## SubTree [S75]
##
## bc_util <= 57.1: Fully Paid (37.1/7.1)
## bc_util > 57.1:
## :...openAccRatio <= 0.53125: Fully Paid (55.9/24.7)
## openAccRatio > 0.53125: Charged Off (34.1/5.9)
##
## SubTree [S76]
##
## tot_coll_amt > 64: Fully Paid (38.9)
## tot_coll_amt <= 64:
## :...total_rev_hi_lim <= 11950: Charged Off (39.4/14.7)
## total_rev_hi_lim > 11950:
## :...mths_since_last_delinq <= 18: Fully Paid (34.7)
## mths_since_last_delinq > 18:
## :...loan_amnt <= 7700:
## :...annual_inc <= 77550: Fully Paid (45.9/14.1)
## : annual_inc > 77550: Charged Off (30.6/9.4)
## loan_amnt > 7700:
## :...annual_inc > 72400: Fully Paid (142.5/7.1)
## annual_inc <= 72400:
## :...emp_length in {< 1 year,5 years,6 years,7 years,
## : 9 years}: Fully Paid (46.5/3.5)
## emp_length in {1 year,3 years,8 years,10+ years}:
## :...mo_sin_old_il_acct <= 131: Fully Paid (55.3/14.1)
## mo_sin_old_il_acct > 131: Charged Off (47.7/15.9)
##
##
## Evaluation on training data (54963 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 788 16522(30.1%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 6302 1377 (a): class Charged Off
## 15145 32139 (b): class Fully Paid
##
##
## Attribute usage:
##
## 100.00% int_rate
## 100.00% sub_grade
## 62.78% purpose
## 57.81% acc_open_past_24mths
## 57.70% mort_acc
## 55.81% num_rev_accts
## 55.52% grade
## 48.29% mo_sin_rcnt_tl
## 46.84% tax_liens
## 46.00% mo_sin_old_il_acct
## 40.67% mo_sin_old_rev_tl_op
## 38.99% avg_cur_bal
## 38.47% mths_since_recent_bc
## 37.92% num_accts_ever_120_pd
## 36.79% annual_inc
## 34.41% num_rev_tl_bal_gt_0
## 34.13% installment
## 32.26% home_ownership
## 30.29% delinq_2yrs
## 30.17% tot_hi_cred_lim
## 29.75% openAccRatio
## 29.53% num_bc_sats
## 28.51% loan_amnt
## 27.64% total_acc
## 25.92% revol_bal
## 25.65% num_bc_tl
## 24.68% emp_length
## 23.49% revol_util
## 22.82% pub_rec
## 21.85% inq_last_6mths
## 21.60% dti
## 21.55% mo_sin_rcnt_rev_tl_op
## 20.10% num_op_rev_tl
## 19.19% bc_util
## 18.41% mths_since_last_delinq
## 17.78% num_actv_rev_tl
## 16.78% bc_open_to_buy
## 16.71% num_il_tl
## 15.10% propSatisBankcardAccts
## 13.92% collections_12_mths_ex_med
## 13.77% num_tl_90g_dpd_24m
## 13.42% mths_since_recent_inq
## 13.08% borrHistory
## 12.22% chargeoff_within_12_mths
## 12.01% total_bc_limit
## 10.66% pub_rec_bankruptcies
## 9.91% verification_status
## 9.34% total_bal_ex_mort
## 9.09% tot_coll_amt
## 8.99% pct_tl_nvr_dlq
## 7.87% total_rev_hi_lim
## 7.87% num_tl_op_past_12m
## 7.70% num_actv_bc_tl
## 7.17% tot_cur_bal
## 6.81% open_acc
## 5.90% percent_bc_gt_75
## 5.74% total_il_high_credit_limit
## 2.95% initial_list_status
## 2.64% num_sats
##
##
## Time: 2.8 secs
# Class labels predicted by the C5.0 tree on the training data,
# cross-tabulated against the actual loan status
predTrn <- predict(c5_DT2, newdata = lcdfTrn, type = "class")
confusionMatrix(data = predTrn, reference = lcdfTrn$loan_status)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Charged Off Fully Paid
## Charged Off 6302 15145
## Fully Paid 1377 32139
##
## Accuracy : 0.6994
## 95% CI : (0.6955, 0.7032)
## No Information Rate : 0.8603
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2858
##
## Mcnemar's Test P-Value : <0.0000000000000002
##
## Sensitivity : 0.8207
## Specificity : 0.6797
## Pos Pred Value : 0.2938
## Neg Pred Value : 0.9589
## Prevalence : 0.1397
## Detection Rate : 0.1147
## Detection Prevalence : 0.3902
## Balanced Accuracy : 0.7502
##
## 'Positive' Class : Charged Off
##
# Test prediction: class probabilities from the C5.0 tree; a loan is counted
# as predicted "Fully Paid" when that probability exceeds CTHRESH
predTst <- predict(c5_DT2, newdata = lcdfTst, type = "prob")
table(
  pred = predTst[, "Fully Paid"] > CTHRESH,
  true = lcdfTst$loan_status
)
## true
## pred Charged Off Fully Paid
## FALSE 2542 8980
## TRUE 5155 38286
On the training data, the model identifies charged-off loans well: overall accuracy is about 70% and, more importantly for our purpose, sensitivity is high at 82% while specificity remains reasonable at 68%.
# ROC curve and AUC value
#
# scores: predicted probability of "Fully Paid" for each row of `dat`
#         (the label ordering below lists "Charged Off" first, "Fully Paid" second)
# dat:    data with a `loan_status` column holding the true labels
# Plots the ROC curve with a diagonal reference line and returns the AUC
# formatted as a string.
fnROCPerformance <- function(scores, dat) {
  rocr_pred <- prediction(
    scores,
    dat$loan_status,
    label.ordering = c("Charged Off", "Fully Paid")
  )

  # ROC curve: true positive rate against false positive rate
  roc_curve <- performance(rocr_pred, "tpr", "fpr")
  plot(roc_curve)
  abline(a = 0, b = 1)

  # AUC value; y.values is a list, so flatten it before formatting
  auc_value <- unlist(performance(rocr_pred, "auc")@y.values)
  sprintf("AUC: %f", auc_value)
}
# Decile lift performance for the minority class ("Charged Off").
#
# scores: predicted probability of loan_status == "Charged Off", one per row of `dat`
# dat:    data with `loan_status` and `grade` columns
# Prints, and invisibly returns, one row per score decile with the loan count,
# number of defaults, default rate, counts for grades A-F, and the cumulative
# defaults, cumulative default rate and cumulative lift.
fnDecileLiftsPerformance_defaults <- function(scores, dat) {
  # Overall default rate: the baseline against which cumulative lift is measured
  totDefRate <- sum(dat$loan_status == "Charged Off") / nrow(dat)

  decPerf <- data.frame(scores) %>%
    cbind(status = dat$loan_status, grade = dat$grade) %>%
    # Negating the score puts the highest-scored loans in decile 1
    mutate(decile = ntile(-scores, 10)) %>%
    group_by(decile) %>%
    summarise(
      count = n(),
      numDefaults = sum(status == "Charged Off"),
      defaultRate = numDefaults / count,
      totA = sum(grade == "A"),
      totB = sum(grade == "B"),
      totC = sum(grade == "C"),
      totD = sum(grade == "D"),
      totE = sum(grade == "E"),
      totF = sum(grade == "F")
    ) %>%
    mutate(
      cumDefaults = cumsum(numDefaults),
      cumDefaultRate = cumDefaults / cumsum(count),
      cumDefaultLift = cumDefaultRate / totDefRate
    )

  print(decPerf)
}
# Returns performance by score decile.
#
# scores: model score, one per row of `dat`; the highest scores land in decile 1
# dat:    data with `loan_status`, `grade`, `actualReturn` and `actualTerm` columns
# Returns one row per decile with the loan count, number of defaults,
# mean/min/max actual return, mean actual term, and counts for grades A-F.
fnDecileReturnsPerformance <- function(scores, dat) {
  scored_loans <- cbind(
    data.frame(scores),
    status = dat$loan_status,
    grade = dat$grade,
    actRet = dat$actualReturn,
    actTerm = dat$actualTerm
  )

  # Rank on the negated score so that decile 1 holds the top-scored loans
  by_decile <- group_by(mutate(scored_loans, decile = ntile(-scores, 10)), decile)

  summarise(
    by_decile,
    count = n(),
    numDefaults = sum(status == "Charged Off"),
    avgActRet = mean(actRet),
    minRet = min(actRet),
    maxRet = max(actRet),
    avgTer = mean(actTerm),
    totA = sum(grade == "A"),
    totB = sum(grade == "B"),
    totC = sum(grade == "C"),
    totD = sum(grade == "D"),
    totE = sum(grade == "E"),
    totF = sum(grade == "F")
  )
}
library(ranger)
# Random forest of 200 trees on the training data (without the varsOmit
# columns), producing class probabilities and permutation importance
rfModel1 <- ranger(
  formula = loan_status ~ .,
  data = select(lcdfTrn, -all_of(varsOmit)),
  num.trees = 200,
  probability = TRUE,
  importance = "permutation"
)
# Variable importance
vimp_rfGp <- importance(rfModel1)
vimp_rfGp
## loan_amnt funded_amnt
## 0.0067023775191 0.0063851559469
## int_rate installment
## 0.0080996959599 0.0081393794119
## grade sub_grade
## 0.0050149624239 0.0065764098532
## emp_length home_ownership
## 0.0003256535211 0.0012283948999
## annual_inc verification_status
## 0.0051228535142 0.0002639883578
## purpose dti
## 0.0004589120114 0.0027138928378
## delinq_2yrs inq_last_6mths
## 0.0002986782499 0.0005323565645
## mths_since_last_delinq open_acc
## 0.0004755147498 0.0023867202224
## pub_rec revol_bal
## 0.0002168992769 0.0056280083655
## revol_util total_acc
## 0.0040358053421 0.0033074726580
## initial_list_status collections_12_mths_ex_med
## 0.0000139302489 0.0000029976816
## acc_now_delinq tot_coll_amt
## -0.0000038061181 0.0001734957221
## tot_cur_bal total_rev_hi_lim
## 0.0140438317588 0.0086723663860
## acc_open_past_24mths avg_cur_bal
## 0.0041461658185 0.0107875883535
## bc_open_to_buy bc_util
## 0.0079197615680 0.0057298582939
## chargeoff_within_12_mths delinq_amnt
## 0.0000088264487 -0.0000053570342
## mo_sin_old_il_acct mo_sin_old_rev_tl_op
## 0.0011953831894 0.0038980814634
## mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl
## 0.0017083267055 0.0017432326779
## mort_acc mths_since_recent_bc
## 0.0015934917213 0.0018951713713
## mths_since_recent_inq num_accts_ever_120_pd
## 0.0005958597093 0.0002803512882
## num_actv_bc_tl num_actv_rev_tl
## 0.0016769522289 0.0031269643619
## num_bc_sats num_bc_tl
## 0.0021724099676 0.0022200856228
## num_il_tl num_op_rev_tl
## 0.0016391916977 0.0027562232542
## num_rev_accts num_rev_tl_bal_gt_0
## 0.0030553543873 0.0030136620593
## num_sats num_tl_120dpd_2m
## 0.0024839646710 0.0000001853568
## num_tl_30dpd num_tl_90g_dpd_24m
## -0.0000027085297 0.0000803676432
## num_tl_op_past_12m pct_tl_nvr_dlq
## 0.0025095798993 0.0007987227241
## percent_bc_gt_75 pub_rec_bankruptcies
## 0.0034834650869 0.0001539274437
## tax_liens tot_hi_cred_lim
## 0.0000548721622 0.0143403669213
## total_bal_ex_mort total_bc_limit
## 0.0056982877549 0.0089991442105
## total_il_high_credit_limit propSatisBankcardAccts
## 0.0035417474762 0.0014606588488
## borrHistory openAccRatio
## 0.0039326522675 0.0023554885355
# Score the training data; the returned object's `predictions` element holds
# one probability column per class ("Charged Off" and "Fully Paid")
scoreTrn <- predict(rfModel1, data = lcdfTrn)
head(scoreTrn$predictions)
## Charged Off Fully Paid
## [1,] 0.68664087 0.3133591
## [2,] 0.03150397 0.9684960
## [3,] 0.18383929 0.8161607
## [4,] 0.02380556 0.9761944
## [5,] 0.15837500 0.8416250
## [6,] 0.09663294 0.9033671
# Classification performance on the training data at a specific threshold:
# predicted "Fully Paid" when its probability exceeds 0.7
table(
  pred = scoreTrn$predictions[, "Fully Paid"] > 0.7,
  actual = lcdfTrn$loan_status
)
## actual
## pred Charged Off Fully Paid
## FALSE 7678 2
## TRUE 1 47282
# Score the test data and tabulate it at the same 0.7 threshold
scoreTst <- predict(rfModel1, data = lcdfTst)
table(
  pred = scoreTst$predictions[, "Fully Paid"] > 0.7,
  actual = lcdfTst$loan_status
)
## actual
## pred Charged Off Fully Paid
## FALSE 1173 2553
## TRUE 6524 44713
# ROC curve and AUC for rfModel1 on the training data.
# label.ordering lists the negative class first and the positive class second,
# so "Fully Paid" is the positive class and the score is its probability.
pred <- prediction(
  scoreTrn$predictions[, "Fully Paid"],
  lcdfTrn$loan_status,
  label.ordering = c("Charged Off", "Fully Paid")
)
# Keep the ROC curve in its own object instead of overwriting aucPerf twice
rocPerf <- performance(pred, "tpr", "fpr")
plot(rocPerf)
abline(a = 0, b = 1) # diagonal reference line
# AUC value: y.values is a list, so extract its single numeric element
aucPerf <- performance(pred, "auc")
sprintf("AUC: %f", aucPerf@y.values[[1]])
## [1] "AUC: 1.000000"
# Test-set AUC for rfModel1, using the fnROCPerformance helper created above.
# scoreTst already holds predict(rfModel1, lcdfTst), so reuse it instead of
# scoring the whole test set a second time.
fnROCPerformance(scoreTst$predictions[, "Fully Paid"], dat = lcdfTst)
## [1] "AUC: 0.671988"
# Decile defaults-lift performance for rfModel1, ranked by P("Charged Off").
# scoreTrn already holds predict(rfModel1, lcdfTrn), so reuse it rather than
# re-scoring the training set.
# NOTE(review): this is evaluated on the training data, where rfModel1 reaches
# AUC 1.0 (vs 0.67 on test), so the lift shown here is optimistic -- consider
# reporting it on lcdfTst as well.
fnDecileLiftsPerformance_defaults(scoreTrn$predictions[, "Charged Off"], lcdfTrn)
## # A tibble: 10 × 13
## decile count numDefaults defaultRate totA totB totC totD totE totF
## <int> <int> <int> <dbl> <int> <int> <int> <int> <int> <int>
## 1 1 5497 5497 1 127 1037 2208 1473 517 118
## 2 2 5497 2182 0.397 579 1419 1497 1214 596 170
## 3 3 5497 0 0 116 982 2355 1581 407 55
## 4 4 5496 0 0 231 1538 2500 1010 202 15
## 5 5 5496 0 0 388 2070 2238 692 99 9
## 6 6 5496 0 0 669 2605 1770 410 38 4
## 7 7 5496 0 0 1069 3014 1202 192 19 0
## 8 8 5496 0 0 1835 2970 626 65 0 0
## 9 9 5496 0 0 2957 2306 225 7 1 0
## 10 10 5496 0 0 4532 936 28 0 0 0
## # … with 3 more variables: cumDefaults <int>, cumDefaultRate <dbl>,
## # cumDefaultLift <dbl>
# Note: the decile defaults-lift function used above computes lifts for the
# minority class, so the score passed to it should be the probability of
# "Charged Off". Since we are looking at returns, we will use "Fully Paid".

# Second random forest with different model parameters -- for example, for the
# case where the default model is seen to overfit. Here the minimum node size
# is set to 50 and the maximum tree depth to 15.
rfModel2 <- ranger(
  loan_status ~ .,
  data = lcdfTrn %>% select(-all_of(varsOmit)),
  num.trees = 500,
  probability = TRUE,
  importance = "permutation",
  min.node.size = 50,
  max.depth = 15
)
# Variable importance of the new model
vimp_rfGp <- importance(rfModel2)
vimp_rfGp
## loan_amnt funded_amnt
## 0.0032331866873 0.0030695998969
## int_rate installment
## 0.0042075540914 0.0040225399403
## grade sub_grade
## 0.0026288406579 0.0040548665299
## emp_length home_ownership
## 0.0001099525200 0.0004570898009
## annual_inc verification_status
## 0.0027306521976 0.0001094136862
## purpose dti
## 0.0001613393573 0.0014463055252
## delinq_2yrs inq_last_6mths
## 0.0001297431854 0.0002604370702
## mths_since_last_delinq open_acc
## 0.0001634374929 0.0009284563645
## pub_rec revol_bal
## 0.0000775631690 0.0024071874451
## revol_util total_acc
## 0.0015624228017 0.0012518809545
## initial_list_status collections_12_mths_ex_med
## 0.0000051478537 0.0000108224846
## acc_now_delinq tot_coll_amt
## 0.0000004825803 0.0000723507557
## tot_cur_bal total_rev_hi_lim
## 0.0083705583218 0.0040774380818
## acc_open_past_24mths avg_cur_bal
## 0.0021478201947 0.0059819933980
## bc_open_to_buy bc_util
## 0.0038558108203 0.0021213096643
## chargeoff_within_12_mths delinq_amnt
## 0.0000053001877 -0.0000068284083
## mo_sin_old_il_acct mo_sin_old_rev_tl_op
## 0.0004965344104 0.0016102501826
## mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl
## 0.0007281452803 0.0008792674532
## mort_acc mths_since_recent_bc
## 0.0008146087031 0.0008696488467
## mths_since_recent_inq num_accts_ever_120_pd
## 0.0002635613209 0.0001062269911
## num_actv_bc_tl num_actv_rev_tl
## 0.0007145764758 0.0015388622264
## num_bc_sats num_bc_tl
## 0.0008652706520 0.0007130449575
## num_il_tl num_op_rev_tl
## 0.0004956486830 0.0011701607579
## num_rev_accts num_rev_tl_bal_gt_0
## 0.0009907745000 0.0016337494792
## num_sats num_tl_120dpd_2m
## 0.0009691402859 -0.0000018935187
## num_tl_30dpd num_tl_90g_dpd_24m
## 0.0000004807062 0.0000449269247
## num_tl_op_past_12m pct_tl_nvr_dlq
## 0.0011581870624 0.0002941440929
## percent_bc_gt_75 pub_rec_bankruptcies
## 0.0014778614991 0.0000751504767
## tax_liens tot_hi_cred_lim
## 0.0000180908517 0.0089848827013
## total_bal_ex_mort total_bc_limit
## 0.0024090688156 0.0043226887643
## total_il_high_credit_limit propSatisBankcardAccts
## 0.0015007967204 0.0004025357906
## borrHistory openAccRatio
## 0.0015329492631 0.0008486871635
# Score the training data with rfModel2 and preview the per-class
# probabilities stored in the returned object's `predictions` element.
scoreTrn <- predict(rfModel2, data = lcdfTrn)
head(scoreTrn$predictions)
## Charged Off Fully Paid
## [1,] 0.48712094 0.5128791
## [2,] 0.04910939 0.9508906
## [3,] 0.17969737 0.8203026
## [4,] 0.02963081 0.9703692
## [5,] 0.24087784 0.7591222
## [6,] 0.16591889 0.8340811
# Training-set confusion table for rfModel2 at the same cut-off:
# predicted "Fully Paid" when the Fully Paid probability exceeds 0.7.
table(
  pred = scoreTrn$predictions[, "Fully Paid"] > 0.7,
  actual = lcdfTrn$loan_status
)
## actual
## pred Charged Off Fully Paid
## FALSE 2993 198
## TRUE 4686 47086
# Repeat the 0.7-threshold confusion table on the test data for rfModel2
scoreTst <- predict(rfModel2, data = lcdfTst)
table(
  pred = scoreTst$predictions[, "Fully Paid"] > 0.7,
  actual = lcdfTst$loan_status
)
## actual
## pred Charged Off Fully Paid
## FALSE 638 1080
## TRUE 7059 46186
# ROC curve and AUC for rfModel2 on the training data.
# label.ordering lists the negative class first and the positive class second,
# so "Fully Paid" is the positive class and the score is its probability.
pred <- prediction(
  scoreTrn$predictions[, "Fully Paid"],
  lcdfTrn$loan_status,
  label.ordering = c("Charged Off", "Fully Paid")
)
# Keep the ROC curve in its own object instead of overwriting aucPerf twice
rocPerf <- performance(pred, "tpr", "fpr")
plot(rocPerf)
abline(a = 0, b = 1) # diagonal reference line
# AUC value: y.values is a list, so extract its single numeric element
aucPerf <- performance(pred, "auc")
sprintf("AUC: %f", aucPerf@y.values[[1]])
## [1] "AUC: 0.925597"
# Test-set AUC for rfModel2 via the performance function defined above.
# scoreTst already holds predict(rfModel2, lcdfTst), so reuse it instead of
# scoring the whole test set a second time.
fnROCPerformance(scoreTst$predictions[, "Fully Paid"], dat = lcdfTst)
## [1] "AUC: 0.683890"
# Decile returns performance for rfModel2 on the training data.
# Note: unlike the defaults-lift function (which expects the probability of
# "Charged Off"), the score passed here is the probability of "Fully Paid".
# scoreTrn already holds predict(rfModel2, lcdfTrn), so reuse it rather than
# re-scoring the training set.
fnDecileReturnsPerformance(scoreTrn$predictions[, "Fully Paid"], lcdfTrn)
## # A tibble: 10 × 13
## decile count numDefaults avgActRet minRet maxRet avgTer totA totB totC
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <int> <int> <int>
## 1 1 5497 1 4.51 -9.37 13.8 2.16 5392 105 0
## 2 2 5497 1 5.49 -4.41 17.8 2.19 3762 1709 25
## 3 3 5497 19 6.47 -19.6 30.9 2.15 1653 3651 182
## 4 4 5496 64 7.09 -30.3 22.3 2.16 681 4093 640
## 5 5 5496 151 7.59 -33.3 29.4 2.15 338 3594 1355
## 6 6 5496 304 7.75 -33.3 34.8 2.17 201 2464 2350
## 7 7 5496 471 7.91 -32.3 34.1 2.17 138 1364 3100
## 8 8 5496 723 7.67 -32.3 31.1 2.21 125 739 3059
## 9 9 5496 1526 5.09 -32.3 32.0 2.34 110 658 2139
## 10 10 5496 4419 -7.51 -33.3 44.4 2.82 103 500 1799
## # … with 3 more variables: totD <int>, totE <int>, totF <int>
Our aim was to find the features we should consider when making an investment decision. We started with 145 variables and analyzed each variable and its relationship with the target variable, loan status. Several factors need to be considered: the actual return may differ from the stated interest rate, and certain factors, such as loan grade and sub-grade, are particularly important based on our analysis. Let us look at the metrics and decide according to our use case.
While we would like to maximize both precision and recall at once, there is a tradeoff between the two:
Reference: https://towardsdatascience.com/the-5-classification-evaluation-metrics-you-must-know-aa97784ff226 Since we are investing with the goal of minimizing loss, we will use the following test confusion matrix obtained by the model.
Based on our analysis, we would get an annual return of 5.5% by investing in lower-risk loans, which our model predicts as Fully Paid with an accuracy of 65%. This is the return that can be expected when investing in such a loan. The potential loss is 12% annually (a return of -12%) based on our calculation. This loss includes the return we would forgo from safer investment options, such as CDs or savings accounts, which provide roughly a 2% annual return. This reiterates our goal of minimizing loss while maintaining a reasonable rate of return.
For example, an investment of 100 dollars should return 116.5 dollars after 3 years at a 5.5% annual return, but we have observed that the loans are repaid by the end of the second year, so our return would be lower. We assume that the money received is moved to another investment option, such as a CD or savings account, for the final year (which could potentially give a return of 2%); hence the actual value after 3 years is 113.2 dollars.
Based on the recovery percentage X and the model, we might have a 5% chance of falsely predicting a charged-off loan as fully paid. Hence, the investor might lose money when investing in loans. However, we have seen that there are X% recoveries on charged-off loans, so the entire amount will not be lost. Had the same amount instead been deposited in an alternative investment, such as a CD or savings account, for 3 years (which could potentially give an annual return of 2%), the value after 3 years would be 106 dollars for a 100-dollar investment. This is the basis of the actual loss calculation, and it is also one of the reasons we have concentrated on minimizing loss while making a reasonable return. The following shows that the C5 rules model and the weighted Random Forest model predict well for our case study, based on the cost matrix we created.
Tuning the cost matrix will give different results and different best models. The selection can depend on the use case.